# print(): write print, add parentheses, and put the text to display inside quotes
print("Hello World")
print(2+3)
print("Python with Ammar")
Hello World 5 Python with Ammar
# Addition
print(2+3)
# Subtraction
print(3-2)
# Multiplication
print(48*3)
# Division (/) always gives a floating-point result, e.g. 7.33
print(22/3) # floating-point result
# Floor division (//) gives a whole number
print(11//3) # whole number (quotient without remainder)
# Exponent (power)
print(2**3)
# Modulus (%) gives the remainder of a division, not a percentage
print(34%2)
# Combining all operators; Python applies PEMDAS precedence
print(2**3/2*3/3+6-4+2)
# PEMDAS:
# Parentheses Exponents Multiplication Division Addition Subtraction
# M/D and A/S are applied left to right
5 1 144 7.333333333333333 3 8 0 8.0
# Anything written inside quotes is a string
print("Hello World")
print("Python with Ammar")
print('test for single quote')
print("test for double quote")
print('''test for trriple quotes''') # triple quotes also allow multi-line strings
print("Whats's Up")
print(" what's up ?")
print("srtring_clear")
Hello World Python with Ammar test for single quote test for double quote test for trriple quotes Whats's Up what's up ? srtring_clear
# Toggle a comment on the current line with Ctrl+/
print("How are you?") # press Ctrl+/ to comment this line out
print (" We are learning python with ammar") # print a string
print(2+3) # print the result of an arithmetic expression
How are you? We are learning python with ammar 5
# Variables: named objects that hold specific values
x=5 # numeric (integer) variable
print (x)
y= "we are learning Python with ammar" # string variable
print(y)
x= x+15
print(x) # prints 20: statements run top to bottom, so x was just updated above
# types/classes of variables
type(x) # bare expression: evaluated, but its result is discarded in a script
print(type(x))
print(type(y))
#print_type_class
# Rules for naming a variable:
# 1. Use only letters, numbers, and underscores
# 2. Do not start with a number (y is valid, 2y is not)
# 3. Spaces are not allowed in a variable name
# 4. Do not reuse keywords or built-in function names (break, mean, etc.)
# 5. Keep names short and descriptive
# 6. Names are case sensitive (lowercase is conventional)
fruit_basket= 10
fruit_basket= "Mangoes" # rebinding: the variable's type changes from int to str
print(type (fruit_basket))
#del fruit_basket
print(fruit_basket)
5 we are learning Python with ammar 20 <class 'int'> <class 'str'> <class 'str'> Mangoes
# greetings= "Assalam-u-Alikum "
# asking=",kia hal hain?"
# print(greetings,name,asking)
# Stage 3: the input() function (it always returns a string)
name=input("what is your name? ")
age=input("How old are you? ") # NOTE(review): age is collected but never used below
greetings="Hello"
print(greetings,name,",you are still young bro ")
#input_ammar_ You are still young
what is your name? Muzammil How old are you? 18 Hello Muzammil ,you are still young bro
# Comparison operators evaluate to either True or False (1 or 0)
# equal to ==
# not equal to !=
# less than <
# greater than >
# less than or equal to <=
# greater than or equal to >=
# question: is 4 equal to 4?
# print(4==4)
# print(4!=4)
# print(4>3)
# print(5<4)
# print(3<=5)
# print(3>=5)
# #application of comparison operators
# hammad_age=4
# age_at_school=5
# print(hammad_age==age_at_school)
# input function combined with a comparison operator
age_at_school= 5 #variable
hammad_age=input("What is the age of Hammad ?") #input() returns a string
hammad_age=int(hammad_age) #convert to int so == compares numbers, not str vs int
print(type(hammad_age))
print(hammad_age==age_at_school) #comparison operator
#convert input
#convert input
What is the age of Hammad ?5 <class 'int'> True
# x=10 #integer
# y=10.2 #float
# z="hello" #string
# print(type(x))
# print(type(y))
# print(type(z))
# implicit type conversion: int * float promotes the result to float
# x=x*y
# print(x, type(x))
# explicit type conversion with int() / float() / str()
# age=input("what is your age? ")
# print(type(float(age)))
# # age=int(age)
# # print(type(int(age)))
# print(age,type(str(age)))
name=input("what is your name ?")
print(name, type(str(name))) # input() already returns str, so str() here is redundant
#type_conversion
what is your name ?Muzammil Muzammil <class 'str'>
# Decide whether Hammad can go to school, using an if / elif / else chain.
required_age_at_school = 4
hammad_age = 1
# Branches are tested top to bottom; pick the first matching message.
if hammad_age == required_age_at_school:
    message = "Congratulation!! Hammad can join the school"
elif hammad_age > required_age_at_school:
    message = "Hammad should join higher school"
elif hammad_age <= 2:
    message = "You should take care of Hammad he is still a baby "
else:
    message = "Hammad Can not go to school "
print(message)
You should take care of Hammad he is still a baby
#1
#defining a functions
# def print_codanics():
# print("We are learning with ammar")
# print("We are learning with ammar")
# print("We are learning with ammar")
# print_codanics()
#2
# def print_code():
# text= "we are learning python with ammar "
# print(text)
# print(text)
# print(text)
# print_code()
#3
# def print_code(text):
# print(text)
# print(text)
# print(text)
# print_code("We are learning python")
#4
#defining a function with if elif and else statement
# def school_calculator(age):
# if age==5:
# print("Hammad can join the school")
# elif age>5:
# print("Hammad should go to higher school")
# else:
# print("Hammad is still a baby")
# school_calculator(5)
#defining a function of future
# def future_age(age):
# new_age= age+20
# return new_age
# print(new_age)
# # print(new_age)
# furture_age=future_age(3)
# print(furture_age)
#i understand functions really well
# def repeat_ali_4times():
# text= ("ALI")
# print(text)
# print(text)
# print(text)
# print(text)
# repeat_ali_4times()
#practice again
# text= input("what do you want to write 5 times")
# def write_4_times(text):
# print(text)
# print(text)
# print(text)
# print(text)
# write_4_times(text)
# Ask for the user's details, then classify their age into a school bracket.
name = input("What is your name? ")
age = int(input("What is your age ? "))  # raises ValueError on non-numeric input
greetings = "Hello"

def school_extrance_calculator(age):
    """Greet the user and print which school bracket the given age falls in."""
    print(greetings, name)
    if age >= 5 and age < 9:
        print("You are welcome to school")
    elif age < 5:
        print("You are not eligible")
    # BUG FIX: the original used age >= 10 here, so age 9 fell through to the
    # final else ("university"); start this bracket at 9 to close the gap.
    elif age >= 9 and age < 15:
        print("You should go to higher school")
    else:
        print("you should go to university")

school_extrance_calculator(age)
What is your name? Muzammil What is your age ? 8 Hello Muzammil You are welcome to school
# while loops and for loops (earlier examples kept commented out)
# x=0
# while(x<=5):
#     print(x)
#     x=x+1
# for x in range(4, 11):
#     print(x)
# Iterate over a list of day names and stop as soon as "fri" is reached.
days = ["mon", "tue", "wed", "thurs", "fri", "sat", "sund"]
for day in days:
    if day == "fri":
        break  # stop the whole loop (continue would only skip this one item)
    print(day)
mon tue wed thurs
# Printing the value of pi from the standard-library math module
import math
print("The value of pi is ",math.pi)
print(type(math.pi))
The value of pi is 3.141592653589793 <class 'float'>
import statistics
x= [150, 250,350,450]
print(statistics.mean(x)) # arithmetic mean of the list: 300
# other important libraries for numeric/data work:
# numpy, pandas
The value of pi is 3.141592653589793 <class 'float'> 300
#print(We are learning) # syntax error: bare text must be quoted
#print(25/0) # runtime error: ZeroDivisionError
name="ammar"
print("Hello", name)
# troubleshooting is easy
Hello ammar
# Read user details, then classify the age with an if / elif chain.
name= input("What is your name ? ")
print(name)
age= int(input("What is your age? ")) # raises ValueError if input is not a whole number
print(age)
print(type(age))
# Branches are tested top to bottom; only the first true branch runs.
if age==24:
print(name,"You are still young bro")
elif age<24:
print(name, "Bachay you are still a baby")
elif age>24 and age<100:
print(name ,"saab Babay ho rhe ho , shadi krwa lo")
elif age>=100 and age<200:
print(name, "Mar ja bhai ")
else:
print("Tu mar chuka hai ")
What is your name ? Muzammil Muzammil What is your age? 24 24 <class 'int'> Muzammil You are still young bro
# BMI = weight in kg / (height in m) squared
name= input("What is your name ? ")
greetings= ("Hello" ,name) # the parentheses with a comma create a tuple
greetings
weight= float(input("what is your weight? "))
height= float(input("and your height ? ")) # NOTE(review): the formula expects metres; entering cm (e.g. 150) yields an implausibly small BMI — confirm intended units
bmi= weight/height**2
bmi
print(name ,"your BMI IS", bmi)
What is your name ? Muzammil what is your weight? 100 and your height ? 150 Muzammil your BMI IS 0.0044444444444444444
#make a string
a= "Samosa Pakora"
a
'Samosa Pakora'
#checking the value at index 0
a[0]
#counting starts from 0 onwards in python
'S'
a[1]
'a'
a[2]
'm'
a[6]
#it will print a space
' '
len(a)
#it will show the number of index in our string
13
a[0:6]
#the stop index is exclusive: asking for 0:6 prints characters at indexes 0 through 5
'Samosa'
a[1:8]
#here P is the 7th character since we start counting from 0
'amosa P'
a[0:13] #here if we count from 0 to 13 it will be total 14 characters here 13 no is exclusive and count complterd from 0 to 12 total as 13
'Samosa Pakora'
a[-2]
#here it starts from right side and will start from number (-1)
'r'
a[-1:-6]
#here it will not print any thing
''
a[-6:-1]
#here we see the writing sequence in string will remain same from right to left
#also -1 no is "a" but it will not print here as last no is exclude
'Pakor'
a[-6:0]
''
a[-6:13]
'Pakora'
food= "birYani"
food
'birYani'
food
'birYani'
#Checking the length
len(food)
7
# Capitalize
food.capitalize()
'Biryani'
#Upper case letters
food.upper()
'BIRYANI'
#lower case letters
food.lower()
'biryani'
#replace
food.replace("b", "sh")
'shirYani'
#counting a specific alphabet in a string
name = "baba_aammar with Dr aamar tufail"
name
'baba_aammar with Dr aamar tufail'
name.count("a")
9
name.count("D")
1
#how to find a number of index in string
name = "baba_aammar with Dr aamar tufail"
name
'baba_aammar with Dr aamar tufail'
name.find("t")
14
# how to split a string
food = "i love samosa , pakora , raita, biryani and karahi"
food
'i love samosa , pakora , raita, biryani and karahi'
food.split(",")
['i love samosa ', ' pakora ', ' raita', ' biryani and karahi']
tup1 = (1,"python" , True , 2.5)
tup1
(1, 'python', True, 2.5)
#type of a tuple
type(tup1)
tuple
tup1[1]
'python'
tup1[0]
1
tup1[0:6]
(1, 'python', True, 2.5)
tup1[0:3] # last element is exclusive
(1, 'python', True)
#length of tuple
len(tup1)
4
tup2 = (2, "baba ammar", 3.5, False )
tup2
(2, 'baba ammar', 3.5, False)
# concatinate ( TO add two or more tuple)
tup1+tup2
(1, 'python', True, 2.5, 2, 'baba ammar', 3.5, False)
#concatinate + repeat
tup1*3 + tup2
(1, 'python', True, 2.5, 1, 'python', True, 2.5, 1, 'python', True, 2.5, 2, 'baba ammar', 3.5, False)
tup1*2 + tup2
(1, 'python', True, 2.5, 1, 'python', True, 2.5, 2, 'baba ammar', 3.5, False)
tup3 = (20, 50, 60, 80, 96)
tup3
(20, 50, 60, 80, 96)
max(tup3)
96
min(tup3)
20
tup3*2
(20, 50, 60, 80, 96, 20, 50, 60, 80, 96)
Markdown tip: three dashes (---) create a horizontal rule / section break
list1 = [2, "baba ammar" , False]
list1
[2, 'baba ammar', False]
type(list1)
list
len(list1)
3
list1[2]
False
list2 = [3, 5, "Aammar", "Codanics", 478, 53.2, True]
list2
[3, 5, 'Aammar', 'Codanics', 478, 53.2, True]
list1 + list2
[2, 'baba ammar', False, 3, 5, 'Aammar', 'Codanics', 478, 53.2, True]
list1*2
[2, 'baba ammar', False, 2, 'baba ammar', False]
list1
[2, 'baba ammar', False]
list1.reverse()
list1
[False, 'baba ammar', 2]
list1.append("codanics youtube channel")
list1
[False, 'baba ammar', 2, 'codanics youtube channel']
list1.count(False)
1
list3 = [20,30,40,50,60,52,562,488,2485]
list3
[20, 30, 40, 50, 60, 52, 562, 488, 2485]
len(list3)
9
#sorting a List
list3.sort()
list3
[20, 30, 40, 50, 52, 60, 488, 562, 2485]
#repeat
list3*3
[20, 30, 40, 50, 52, 60, 488, 562, 2485, 20, 30, 40, 50, 52, 60, 488, 562, 2485, 20, 30, 40, 50, 52, 60, 488, 562, 2485]
list2+list3
[3, 5, 'Aammar', 'Codanics', 478, 53.2, True, 20, 30, 40, 50, 52, 60, 488, 562, 2485]
lists= list1 +list2
lists
[False, 'baba ammar', 2, 'codanics youtube channel', 3, 5, 'Aammar', 'Codanics', 478, 53.2, True]
#Food and thier prices
food1= {"Samosa" : 30, "Pakora" : 100, "Raita" : 20, "Salad" : 50, "Chicken Rolls": 30,}
food1
{'Samosa': 30, 'Pakora': 100, 'Raita': 20, 'Salad': 50, 'Chicken Rolls': 30}
type(food1)
dict
#extract data
keys= food1.keys()
keys
dict_keys(['Samosa', 'Pakora', 'Raita', 'Salad', 'Chicken Rolls'])
values = food1.values()
values
dict_values([30, 100, 20, 50, 30])
#adding new element
food1["Tikki"]=10
food1
{'Samosa': 30,
'Pakora': 100,
'Raita': 20,
'Salad': 50,
'Chicken Rolls': 30,
'Tikki': 10}
#updating a values
food1["Tikki"]= 15
food1
{'Samosa': 30,
'Pakora': 100,
'Raita': 20,
'Salad': 50,
'Chicken Rolls': 30,
'Tikki': 15}
food2 = {"Dates": 50, "Chocolates":200, "Sawayyan":1000}
food2
{'Dates': 50, 'Chocolates': 200, 'Sawayyan': 1000}
#Concatinate
food1.update(food2)
food1
{'Samosa': 30,
'Pakora': 100,
'Raita': 20,
'Salad': 50,
'Chicken Rolls': 30,
'Tikki': 15,
'Dates': 50,
'Chocolates': 200,
'Sawayyan': 1000}
s1= {1, 2, 2.2, 5, "Codanics", "Faisalabad", True}
s1
# here we see that boolean operator dosent print in sets
{1, 2, 2.2, 5, 'Codanics', 'Faisalabad'}
s1.add("codanics")
s1
{1, 2, 2.2, 5, 'Codanics', 'Faisalabad', 'codanics'}
s1.add("Faisalabad")
s1
{1, 2, 2.2, 5, 'Codanics', 'Faisalabad', 'codanics'}
s1.remove("codanics")
s1
{1, 2, 2.2, 5, 'Codanics', 'Faisalabad'}
# Bar chart of Titanic survival rate by sex, split into bars by passenger class
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="ticks", color_codes= True)
titanic = sns.load_dataset("titanic")
sns.catplot(x="sex", y="survived", hue="class", kind="bar", data=titanic)
<seaborn.axisgrid.FacetGrid at 0x14152a7df40>
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="ticks", color_codes=True)
titanic= sns.load_dataset("titanic")
titanic
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 0 | 2 | male | 27.0 | 0 | 0 | 13.0000 | S | Second | man | True | NaN | Southampton | no | True |
| 887 | 1 | 1 | female | 19.0 | 0 | 0 | 30.0000 | S | First | woman | False | B | Southampton | yes | True |
| 888 | 0 | 3 | female | NaN | 1 | 2 | 23.4500 | S | Third | woman | False | NaN | Southampton | no | False |
| 889 | 1 | 1 | male | 26.0 | 0 | 0 | 30.0000 | C | First | man | True | C | Cherbourg | yes | True |
| 890 | 0 | 3 | male | 32.0 | 0 | 0 | 7.7500 | Q | Third | man | True | NaN | Queenstown | no | True |
891 rows × 15 columns
p1=sns.countplot(x="who", data=titanic, hue="alone")
p1.set_title("PLot for Counting")
Text(0.5, 1.0, 'PLot for Counting')
#scatter plot
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="ticks", color_codes=True)
titanic = sns.load_dataset("titanic")
g=sns.FacetGrid(titanic, row="sex", hue= "alone")
g=(g.map(plt.scatter,"age", "fare").add_legend())
Installing seaborn automatically installs its dependencies, e.g. numpy
import seaborn as sns
import matplotlib.pyplot as plt
#load data set
phool = sns.load_dataset("iris")
phool
#draw a line plot
# sns.lineplotin(x="", y="", data=phool)
#lie plot always work betwwen numeric values
| sepal_length | sepal_width | petal_length | petal_width | species | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| ... | ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
sns.lineplot(x="sepal_length", y="sepal_width", data=phool)
plt.show()
#Program with title
import seaborn as sns
import matplotlib.pyplot as plt
phool = sns.load_dataset("iris")
sns.lineplot(x="sepal_length", y="sepal_width", data=phool)
plt.title("Phoolo ka Plot")
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
phool = sns.load_dataset("iris")
sns.lineplot(x="sepal_length", y="sepal_width", data=phool)
plt.title("Phoolo ka Plot")
plt.xlim(2)
plt.ylim(1)
plt.show()
#to remove alreay or default style first
sns.set_style(style= None , rc=None )
import seaborn as sns
import matplotlib.pyplot as plt
phool = sns.load_dataset("iris")
sns.lineplot(x="sepal_length", y="sepal_width", data=phool)
plt.title("Phoolo ka Plot")
#sns.set_style(style= None , rc=None)
sns.set_style("darkgrid")
plt.xlim(2)
plt.ylim(1)
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
phool = sns.load_dataset("iris")
#after loading data set
#changing size of final figure
plt.figure(figsize=(12,10))
sns.lineplot(x="sepal_length", y="sepal_width", data=phool)
plt.title("Phoolo ka Plot")
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
phool = sns.load_dataset("iris")
plt.figure(figsize=(12,10))
sns.lineplot(x="sepal_length", y="sepal_width", data=phool)
plt.title("Phoolo ka Plot")
sns.set_style(style= None , rc=None)
sns.set_style("darkgrid")
plt.xlim(3)
plt.ylim(1.5)
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
phool = sns.load_dataset("iris")
phool
sns.barplot(x="species", y="sepal_width", data=phool)
plt.title("Phoolo ka Plot")
plt.show()
phool
| sepal_length | sepal_width | petal_length | petal_width | species | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| ... | ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
import seaborn as sns
import matplotlib.pyplot as plt
phool = sns.load_dataset("iris")
phool
sns.barplot(x="species", y="petal_length", data=phool)
plt.title("Phoolo ka Plot")
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
kashti = sns.load_dataset("titanic")
kashti
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 0 | 2 | male | 27.0 | 0 | 0 | 13.0000 | S | Second | man | True | NaN | Southampton | no | True |
| 887 | 1 | 1 | female | 19.0 | 0 | 0 | 30.0000 | S | First | woman | False | B | Southampton | yes | True |
| 888 | 0 | 3 | female | NaN | 1 | 2 | 23.4500 | S | Third | woman | False | NaN | Southampton | no | False |
| 889 | 1 | 1 | male | 26.0 | 0 | 0 | 30.0000 | C | First | man | True | C | Cherbourg | yes | True |
| 890 | 0 | 3 | male | 32.0 | 0 | 0 | 7.7500 | Q | Third | man | True | NaN | Queenstown | no | True |
891 rows × 15 columns
import seaborn as sns
import matplotlib.pyplot as plt
kashti = sns.load_dataset("titanic")
sns.barplot(x="who",y="alone", data=kashti)
plt.title("titanic ka Plot")
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
kashti = sns.load_dataset("titanic")
sns.barplot(x="sex",y="alone", hue="who", data=kashti)
plt.title("titanic ka Plot")
plt.show()
## order setting
import seaborn as sns
import matplotlib.pyplot as plt
kashti = sns.load_dataset("titanic")
sns.barplot(x="sex",y="alone", hue="who", data=kashti, order=["female" ,"male"] )
plt.title("titanic ka Plot")
plt.show()
#changing color
## order setting
import seaborn as sns
import matplotlib.pyplot as plt
kashti = sns.load_dataset("titanic")
sns.barplot(x="sex",y="alone", hue="who", data=kashti, order=["female" ,"male"] ,color= "grey")
plt.title("titanic ka Plot")
plt.show()
# Removing the error bars from the bar plot
import seaborn as sns
import matplotlib.pyplot as plt
kashti = sns.load_dataset("titanic")
sns.barplot(x="sex",y="alone", hue="who", data=kashti, order=["female" ,"male"] ,ci= None) # NOTE(review): ci= is deprecated since seaborn 0.12 (use errorbar=None) — confirm installed version
plt.title("titanic ka Plot")
plt.show()
#using different paletts
import seaborn as sns
import matplotlib.pyplot as plt
kashti = sns.load_dataset("titanic")
sns.barplot(x="sex",y="alone", hue="who", data=kashti, order=["female" ,"male"] ,ci= None , palette= "pastel")
plt.title("titanic ka Plot")
plt.show()
#we can search for seaborne built in color palettes
# intensity of colors
import seaborn as sns
import matplotlib.pyplot as plt
kashti = sns.load_dataset("titanic")
sns.barplot(x="sex",y="alone", hue="who", data=kashti, order=["female" ,"male"] ,ci= None , saturation=.5)
plt.title("titanic ka Plot")
plt.show()
#HOrizontal Plot
#numeric parameter on x axis
import seaborn as sns
import matplotlib.pyplot as plt
kashti = sns.load_dataset("titanic")
sns.barplot(x="fare", y="class", hue="sex", data=kashti ,ci= None)
plt.title("titanic ka Plot")
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
kashti = sns.load_dataset("titanic")
sns.barplot(x="fare", y="class", hue="sex", data=kashti ,ci= None)
plt.title("titanic ka Plot")
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
kashti = sns.load_dataset("titanic")
sns.barplot(x="class", y="fare", data=kashti ,linewidth=2.5,facecolor=(1,1,1,1) , errcolor= "0.5" , edgecolor= "0.5" )
plt.title("titanic ka Plot")
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
#canvas (baloon board)
sns.set(style="whitegrid")
kashti= sns.load_dataset("titanic")
sns.boxplot(x="class",y="fare", data=kashti)
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="whitegrid")
tip= sns.load_dataset("tips")
tip
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
| 240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
| 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
| 242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
| 243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
import seaborn as sns
import matplotlib.pyplot as plt
import numpy #estimator will not work here
sns.set(style="whitegrid")
tip= sns.load_dataset("tips")
sns.boxplot(x="day",y="tip", data=tip, saturation=1)
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
sns.set(style="whitegrid")
tip= sns.load_dataset("tips")
tip.describe()
| total_bill | tip | size | |
|---|---|---|---|
| count | 244.000000 | 244.000000 | 244.000000 |
| mean | 19.785943 | 2.998279 | 2.569672 |
| std | 8.902412 | 1.383638 | 0.951100 |
| min | 3.070000 | 1.000000 | 1.000000 |
| 25% | 13.347500 | 2.000000 | 2.000000 |
| 50% | 17.795000 | 2.900000 | 2.000000 |
| 75% | 24.127500 | 3.562500 | 3.000000 |
| max | 50.810000 | 10.000000 | 6.000000 |
# a categorical variable must be drawn on the x-axis or passed as hue
# a numeric variable goes on the y-axis
# a numeric variable should not be passed to hue
#for single
import seaborn as sns
sns.set(style="whitegrid")
tip= sns.load_dataset("tips")
sns.boxplot(x=tip["tip"])
<AxesSubplot:xlabel='tip'>
import seaborn as sns
sns.set(style="whitegrid")
tip= sns.load_dataset("tips")
sns.boxplot(y=tip["total_bill"])
<AxesSubplot:ylabel='total_bill'>
import seaborn as sns
sns.set(style="whitegrid")
tip= sns.load_dataset("tips")
sns.boxplot(x="tip", y="day", data= tip)
<AxesSubplot:xlabel='tip', ylabel='day'>
import seaborn as sns
sns.set(style="whitegrid")
tip= sns.load_dataset("tips")
sns.boxplot(x="tip", y="day", hue="smoker" ,data= tip, palette ="Set2")
<AxesSubplot:xlabel='tip', ylabel='day'>
#dodge
import seaborn as sns
sns.set(style="whitegrid")
tip= sns.load_dataset("tips")
sns.boxplot(x="tip", y="day", hue="smoker" ,data= tip, palette ="Set2", dodge=True)
<AxesSubplot:xlabel='tip', ylabel='day'>
import seaborn as sns
sns.set(style="whitegrid")
tip= sns.load_dataset("tips")
sns.boxplot(x="tip", y="day", hue="smoker" ,data= tip, palette ="Set2", dodge=False)
<AxesSubplot:xlabel='tip', ylabel='day'>
import seaborn as sns
import matplotlib.pyplot as plt
import numpy
sns.set(style="whitegrid")
tip= sns.load_dataset("tips")
my_color= {"Yes":"#0d9ea8","No":"#6e0c11"}
sns.boxplot(x="tip",y="day", hue="smoker", data=tip, saturation=1 ,palette=my_color )
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
kashti= sns.load_dataset("titanic")
kashti.head()
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
kashti= sns.load_dataset("titanic")
sns.boxplot(x="survived", y="age", data= kashti)
<AxesSubplot:xlabel='survived', ylabel='age'>
kashti= sns.load_dataset("titanic")
p1 = sns.boxplot(x="survived", y="age", showmeans=True, data= kashti)
p1
<AxesSubplot:xlabel='survived', ylabel='age'>
kashti= sns.load_dataset("titanic")
p1 = sns.boxplot(x="survived", y="age", showmeans=True, meanprops={"marker":"+", "markersize":"12", "markeredgecolor":"red"} ,data= kashti)
p1
<AxesSubplot:xlabel='survived', ylabel='age'>
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
kashti= sns.load_dataset("titanic")
sns.boxplot(x="survived", y="age", showmeans=True, meanprops={"marker":"+", "markersize":"12", "markeredgecolor":"red"} ,data= kashti)
plt.title("Kitne doobay or kitne bachay" , size= 25, weight="bold")
plt.xlabel ("Kitnay bach gaey ", size= 16)
plt.ylabel ("umar kia hai", size = 16 )
Text(0, 0.5, 'umar kia hai')
# importing libraries
import seaborn as sns
import matplotlib.pyplot as plt
import pandas
# setting the canvas
sns.set(style="darkgrid")
sns.set_style(style= None , rc=None)
# loading the dataset
tip= sns.load_dataset("tips")
# adjusting the size of the final graph
plt.figure(figsize=(12,12))
# custom palette mapping each hue level to a hex color (from a hex color picker)
my_color= {"Yes":"#0d9ea8","No":"#6e0c11"}
# plotting the graph
sns.boxplot(x="tip",y="day", hue="smoker", data=tip, saturation=1 ,palette=my_color )
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
#importing data set
my_data= pd.read_csv("new_data.csv")
my_data.head()
| Gender | Location | Age_range | Qualification | field_of_study | Purpose_for_chilla | Work_status | Blood_group | Mobile_sim | Sim_type | ... | Your favorite programming language? | Marital Status? | Are you Vaccinated? | Where do you live? | Working experience | Age | Weight_in_kg | Height_in_cm | How many hours you code a day? (int) | Light kitni der band hti hy? int | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Male | Pakistan | 36-40 | Masters | Natural Sciences | to boost my skill set | Unemplyed | B+ | U-fone | Prepaid | ... | Python | Yes | Yes | Urbun | 5.0 | 38.00 | 77.0 | 179.000 | 3.0 | 2 |
| 1 | Male | Pakistan | 26-30 | Bachelors | CS/IT | to boost my skill set | Student | B+ | U-fone | Prepaid | ... | Python | No | Yes | Urbun | 1.0 | 25.00 | 53.6 | 178.000 | 2.0 | 6 |
| 2 | Male | Pakistan | 31-35 | Masters | Enginnering | Switch my field of study | Employed | B+ | Zong | Prepaid | ... | Python | Yes | Yes | Urbun | 5.5 | 31.34 | 93.0 | 173.000 | 2.0 | 0 |
| 3 | Female | Pakistan | 31-35 | Masters | CS/IT | to boost my skill set | Employed | O+ | U-fone | Postpaid | ... | Python | Yes | Yes | Urbun | 5.0 | 33.00 | 60.0 | 157.000 | 3.0 | 24 |
| 4 | Female | Pakistan | 26-30 | Masters | Enginnering | to boost my skill set | Student | A- | Mobilink | Prepaid | ... | Javascript | No | Yes | Rural | 3.5 | 27.00 | 59.9 | 164.544 | 6.0 | 12 |
5 rows × 23 columns
#importing libraries
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
#importing data set
my_data= pd.read_csv("new_data.csv")
#setting canvas
sns.set(style="darkgrid")
#setting final figure size
plt.figure(figsize=(18,12))
sns.boxplot(x="Qualification", y="Weight_in_kg" ,hue="Gender", dodge= True ,data= my_data, saturation= 1, showmeans=True, meanprops={"marker":"+", "markersize":"15", "markeredgecolor":"red"})
plt.title("Kon Ziada Khata hai ?", size= 28, weight= "bold")
plt.show()
import plotly.express as px
import pandas as pd
data= pd.read_csv("new_data.csv")
fig = px.scatter(data, x="Weight_in_kg", y="Working experience", color="Gender", symbol="Marital Status?", facet_col="Qualification",
labels={"Gender": "Sex", "Marital Status?": "Married"})
fig.show()
import plotly.express as px
import pandas as pd
data= pd.read_csv("new_data.csv")
fig = px.scatter(data, x="Age", y="Working experience", color="Gender", symbol="Marital Status?", facet_col="Qualification",
labels={"Gender": "Sex", "Marital Status?": "Married"})
fig.show()
import plotly.express as px
import pandas as pd
data= pd.read_csv("new_data.csv")
fig = px.scatter(data, x="Weight_in_kg", y="Age", color="Location",
size="Working experience", size_max=45, log_x=True)
fig.update_layout(legend=dict(
orientation="h",
yanchor="bottom",
y=1,
xanchor="right",
x=1
))
fig.show()
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="darkgrid")
plt.figure(figsize=(10,10))
import pandas as pd
my_data= pd.read_csv("new_data.csv")
sns.boxenplot(x="Location", y="Weight_in_kg",
color="b",
scale="linear", data=my_data)
plt.title("Konsi Country k loog ziada khate hain" ,size=25 , weight="bold")
Text(0.5, 1.0, 'Konsi Country k loog ziada khate hain')
import plotly.express as px
import pandas as pd
data= pd.read_csv("new_data.csv")
fig = px.area(data, x="Age", y="Weight_in_kg", color="Qualification", line_group="Location")
fig.show()
import plotly.express as px
import pandas as pd
data= pd.read_csv("new_data.csv")
fig = px.sunburst(data, path=['Qualification', 'Location'], values='Age',
color='Working experience', hover_data=['Marital Status?'])
fig.show()
import plotly.express as px
import pandas as pd
data= pd.read_csv("new_data.csv")
fig = px.violin(data, y="Age", x="Marital Status?", color="Gender", box=True, points="all", hover_data=data.columns)
fig.show()
import plotly.express as px
import pandas as pd
data= pd.read_csv("new_data.csv")
fig = px.density_contour(data, x="Age", y="Weight_in_kg")
fig.show()
import plotly.express as px
import pandas as pd
data= pd.read_csv("new_data.csv")
fig = px.line_polar(data, r="Age", theta="Location", color="Qualification", line_close=True,
color_discrete_sequence=px.colors.sequential.Plasma_r)
fig.show()
import plotly.express as px
import pandas as pd
data= pd.read_csv("new_data.csv")
fig = px.scatter(data, x="Weight_in_kg", y="Height_in_cm", color="Qualification", marginal_y="violin",
marginal_x="box", trendline="ols", template="simple_white", labels={"Height_in_cm" : "Height", "Weight_in_kg": "Weight", "Male": "Munda" })
fig.show()
import numpy as np
a= np.array([1,2,3,4,5])
a
array([1, 2, 3, 4, 5])
type(a)
numpy.ndarray
len(a)
5
# creating a single axis array of number zero
c= np.zeros(2)
c
array([0., 0.])
# creating a single axis array of number One
d= np.ones(10)
d
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
e=np.empty(3)
e
array([9.96959917e-312, 0.00000000e+000, 4.94065646e-324])
# with the specific range of elements
g= np.arange(5,15) # as we arleady know the last no. is exclusive
g
array([ 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
# a range of elements with a specified gap (step)
h=np.arange(2,20,2) # goin from 2 to 20 with a specified gap of 2 and last no. is exlcusive
h
array([ 2, 4, 6, 8, 10, 12, 14, 16, 18])
# and if we want 20 no. also in last arange
h=np.arange(2,21,2)
h
array([ 2, 4, 6, 8, 10, 12, 14, 16, 18, 20])
# Linearly spaced arrays
i= np.linspace(0,15 , num= 5) # GOing 0 to 15 in just 5 numbers in a way that the distance between each no. will remain same
i
array([ 0. , 3.75, 7.5 , 11.25, 15. ])
j= np.ones(5, dtype=np.float64)
j
array([1., 1., 1., 1., 1.])
b= np.array([[2,2,2,2],[3,3,3,3]])
b
array([[2, 2, 2, 2],
[3, 3, 3, 3]])
e= np.array([[1,1,1,1],[2,2,2,2]])
e
array([[1, 1, 1, 1],
[2, 2, 2, 2]])
k=np.zeros((3,4))
k
array([[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.]])
l=np.zeros((5,6))
l
array([[0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0.]])
m=np.ones((2,4))
m
array([[1., 1., 1., 1.],
[1., 1., 1., 1.]])
d= np.array([[2,2,2],[2,2,2],[6,5,4]])
d
array([[2, 2, 2],
[2, 2, 2],
[6, 5, 4]])
f= np.array ([[4,5,6,4],[8,6,4,5],[8,6,4,2],[9,6,3,2]])
f
array([[4, 5, 6, 4],
[8, 6, 4, 5],
[8, 6, 4, 2],
[9, 6, 3, 2]])
g= np.array([[1,2,3],[2,3,4],[4,5,6]])
g
array([[1, 2, 3],
[2, 3, 4],
[4, 5, 6]])
# TensorFlow is a library used for 3-dimensional (tensor) data; it is also a
# free and open-source software library for machine learning and AI.
# Making and reshaping a 3-D array: the first axis has length 2,
# the second axis length 3, and the third axis length 4 (2 * 3 * 4 == 24).
c = np.arange(24).reshape(2, 3, 4)
c
array([[[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]],
[[12, 13, 14, 15],
[16, 17, 18, 19],
[20, 21, 22, 23]]])
d= np.zeros((2,3,3))
d
array([[[0., 0., 0.],
[0., 0., 0.],
[0., 0., 0.]],
[[0., 0., 0.],
[0., 0., 0.],
[0., 0., 0.]]])
f= np.ones((3,4,5) , dtype= np.int64)
f
array([[[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1]],
[[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1]],
[[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1]]], dtype=int64)
f= np.array ([[[4,5,6,4],[8,6,4,5],[8,6,4,2],[9,6,3,2]],[[4,5,6,4],[8,6,4,5],[8,6,4,2],[9,6,3,2]],[[4,5,6,4],[8,6,4,5],[8,6,4,2],[9,6,3,2]]])
f
array([[[4, 5, 6, 4],
[8, 6, 4, 5],
[8, 6, 4, 2],
[9, 6, 3, 2]],
[[4, 5, 6, 4],
[8, 6, 4, 5],
[8, 6, 4, 2],
[9, 6, 3, 2]],
[[4, 5, 6, 4],
[8, 6, 4, 5],
[8, 6, 4, 2],
[9, 6, 3, 2]]])
z= np.array ([[[1,2,3],[4,5,6]],[[7,8,9],[10,11,12]],[[13,14,15],[16,17,18]]])
z
array([[[ 1, 2, 3],
[ 4, 5, 6]],
[[ 7, 8, 9],
[10, 11, 12]],
[[13, 14, 15],
[16, 17, 18]]])
# Importing the numpy library.
# Fix: the original cell imported numpy twice on consecutive lines;
# the redundant duplicate import has been removed.
import numpy as np

# A small string array of snack names. NumPy stores these as fixed-width
# unicode: dtype becomes '<U6' because the longest name has 6 characters.
food = np.array(["pakora", "samosa", "raita"])
food
array(['pakora', 'samosa', 'raita'], dtype='<U6')
price = np.array([5] * 3)  # every snack costs the same: three fives
price
array([5, 5, 5])
#Checking type of array
type(price)
numpy.ndarray
type(food)
numpy.ndarray
#length of array
len(food)
3
#indexing
price[2]
5
price[0:]
array([5, 5, 5])
# use an index number to look up an element in the array
food[1]
'samosa'
price.mean()
5.0
# zeros method
a= np.zeros(6)
a
array([0., 0., 0., 0., 0., 0.])
# ones method
b= np.ones(5)
b
array([1., 1., 1., 1., 1.])
c= np.empty(5)
c
array([1., 1., 1., 1., 1.])
# Making a Range
a= np.arange(10)
a
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
# Specified range
a= np.arange(2,21)
a
array([ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
19, 20])
# a specific arange with a specific step between elements
a= np.arange (2,20,3)
a
array([ 2, 5, 8, 11, 14, 17])
#table of 5
a= np.arange (5,55,5)
a
array([ 5, 10, 15, 20, 25, 30, 35, 40, 45, 50])
# with line space
a= np.linspace (0,10, num = 6 ,dtype= np.int64 )
a
array([ 0, 2, 4, 6, 8, 10], dtype=int64)
b= np.linspace(1,100, num = 40)
b
array([ 1. , 3.53846154, 6.07692308, 8.61538462,
11.15384615, 13.69230769, 16.23076923, 18.76923077,
21.30769231, 23.84615385, 26.38461538, 28.92307692,
31.46153846, 34. , 36.53846154, 39.07692308,
41.61538462, 44.15384615, 46.69230769, 49.23076923,
51.76923077, 54.30769231, 56.84615385, 59.38461538,
61.92307692, 64.46153846, 67. , 69.53846154,
72.07692308, 74.61538462, 77.15384615, 79.69230769,
82.23076923, 84.76923077, 87.30769231, 89.84615385,
92.38461538, 94.92307692, 97.46153846, 100. ])
# specifying the data type
a= np.ones(5, dtype= np.int8)
a
array([1, 1, 1, 1, 1], dtype=int8)
a= np.ones(50, dtype= np.float64)
a
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
a= np.array([ 10,12,15,2,4,6,18,100,18,16,10.3,0.5])
a
array([ 10. , 12. , 15. , 2. , 4. , 6. , 18. , 100. , 18. ,
16. , 10.3, 0.5])
#sorting an array
a.sort()
a
array([ 0.5, 2. , 4. , 6. , 10. , 10.3, 12. , 15. , 16. ,
18. , 18. , 100. ])
b= np.array([10.5,5,15.6,8,0.5,10.5,100.9,15,16,59])
b
array([ 10.5, 5. , 15.6, 8. , 0.5, 10.5, 100.9, 15. , 16. ,
59. ])
c= np.concatenate((a,b))
c
array([ 0.5, 2. , 4. , 6. , 10. , 10.3, 12. , 15. , 16. ,
18. , 18. , 100. , 10.5, 5. , 15.6, 8. , 0.5, 10.5,
100.9, 15. , 16. , 59. ])
c.sort()
c
array([ 0.5, 0.5, 2. , 4. , 5. , 6. , 8. , 10. , 10.3,
10.5, 10.5, 12. , 15. , 15. , 15.6, 16. , 16. , 18. ,
18. , 59. , 100. , 100.9])
a = np.array ([[1,2,3],[2,6,5]])
a
array([[1, 2, 3],
[2, 6, 5]])
b = np.array ([[3,6,5],[6,8,9]])
b
array([[3, 6, 5],
[6, 8, 9]])
#checking the shape of matrix
b.shape
(2, 3)
c= np.concatenate((a,b) ,axis= 1)
c
array([[1, 2, 3, 3, 6, 5],
[2, 6, 5, 6, 8, 9]])
c= np.concatenate((a,b) ,axis= 0)
c
array([[1, 2, 3],
[2, 6, 5],
[3, 6, 5],
[6, 8, 9]])
c.shape
(4, 3)
# A 3-D character array: the same 2x3 page of letters stacked three times.
page = [['a', 'b', 'c'], ['e', 'd', 'f']]
a = np.array([page, page, page])
a
array([[['a', 'b', 'c'],
['e', 'd', 'f']],
[['a', 'b', 'c'],
['e', 'd', 'f']],
[['a', 'b', 'c'],
['e', 'd', 'f']]], dtype='<U1')
#finding a no. of dimensions
a.ndim
3
a.size
18
# shape of array
a.shape
(3, 2, 3)
# A 3-D integer array: one 3x3 layer repeated three times along axis 0.
layer = [[1, 2, 3], [7, 8, 9], [9, 6, 3]]
b = np.array([layer] * 3)
b
array([[[1, 2, 3],
[7, 8, 9],
[9, 6, 3]],
[[1, 2, 3],
[7, 8, 9],
[9, 6, 3]],
[[1, 2, 3],
[7, 8, 9],
[9, 6, 3]]])
b.ndim
3
type(a)
numpy.ndarray
b.shape
(3, 3, 3)
b.size
27
a= np.arange(9)
a
array([0, 1, 2, 3, 4, 5, 6, 7, 8])
a.reshape (3,3) #3*3=9 (9 indexes are there in array )
array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])
a.shape
(9,)
#row wise conversion
b= a[np.newaxis,:]
b
array([[0, 1, 2, 3, 4, 5, 6, 7, 8]])
b.shape
(1, 9)
# column-wise conversion
b= a[:, np.newaxis]
b
array([[0],
[1],
[2],
[3],
[4],
[5],
[6],
[7],
[8]])
b.shape
(9, 1)
c= np.arange(9)
c
array([0, 1, 2, 3, 4, 5, 6, 7, 8])
c.shape
(9,)
d=c[np.newaxis, :]
d
array([[0, 1, 2, 3, 4, 5, 6, 7, 8]])
d.shape
(1, 9)
a
array([0, 1, 2, 3, 4, 5, 6, 7, 8])
a[2]
2
a[0:5]
array([0, 1, 2, 3, 4])
a*6
array([ 0, 6, 12, 18, 24, 30, 36, 42, 48])
a+6
array([ 6, 7, 8, 9, 10, 11, 12, 13, 14])
a.sum()
36
a.mean()
4.0
a.max()
8
a.min()
0
# Importing the libraries used in this section.
import pandas as pd
import numpy as np

# Object creation: a Series with one missing value — np.nan forces the
# whole Series to float64.
s = pd.Series([1, 2, np.nan, 5, 7, 8, 9])
s
0 1.0 1 2.0 2 NaN 3 5.0 4 7.0 5 8.0 6 9.0 dtype: float64
dates = pd.date_range("20220101", periods=9)
dates
DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
'2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
'2022-01-09'],
dtype='datetime64[ns]', freq='D')
dates = pd.date_range("20220101", periods=33)
dates
DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
'2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
'2022-01-09', '2022-01-10', '2022-01-11', '2022-01-12',
'2022-01-13', '2022-01-14', '2022-01-15', '2022-01-16',
'2022-01-17', '2022-01-18', '2022-01-19', '2022-01-20',
'2022-01-21', '2022-01-22', '2022-01-23', '2022-01-24',
'2022-01-25', '2022-01-26', '2022-01-27', '2022-01-28',
'2022-01-29', '2022-01-30', '2022-01-31', '2022-02-01',
'2022-02-02'],
dtype='datetime64[ns]', freq='D')
df= pd.DataFrame(np.random.randn(33,5), index= dates, columns= list("ABCDE"))
df
| A | B | C | D | E | |
|---|---|---|---|---|---|
| 2022-01-01 | -1.424660 | 0.772210 | 0.783466 | -0.712659 | -2.551065 |
| 2022-01-02 | 1.382429 | -1.708730 | 1.176728 | 1.949761 | -0.492011 |
| 2022-01-03 | 0.325283 | -0.834825 | -0.879866 | -0.137038 | 0.919688 |
| 2022-01-04 | 0.552770 | -0.454804 | -0.747360 | 1.800993 | -0.275458 |
| 2022-01-05 | -0.896864 | -0.958084 | 0.579021 | 0.478870 | 0.260666 |
| 2022-01-06 | -3.103231 | -0.340718 | 0.050150 | 0.058931 | -1.712098 |
| 2022-01-07 | 1.769682 | -1.402604 | -2.110711 | -0.922010 | -0.753877 |
| 2022-01-08 | 0.660826 | 1.017961 | 0.569261 | -1.209834 | 1.056544 |
| 2022-01-09 | 0.125091 | -0.340940 | 1.023370 | -1.555463 | 0.322947 |
| 2022-01-10 | 1.915190 | 0.314069 | -0.638422 | -1.023607 | -1.544831 |
| 2022-01-11 | -0.637742 | -1.434976 | -1.276443 | -1.153942 | 0.761682 |
| 2022-01-12 | -0.742913 | -1.484980 | -0.917490 | -0.283180 | -0.053965 |
| 2022-01-13 | 0.201580 | 0.786547 | -1.459392 | 0.126138 | -0.312369 |
| 2022-01-14 | 1.321449 | 0.050355 | -0.933186 | -1.043294 | 1.676111 |
| 2022-01-15 | -0.436737 | -0.262544 | 0.686336 | 0.144395 | 0.625462 |
| 2022-01-16 | -0.152087 | 0.789873 | -1.096439 | -0.172554 | -1.109436 |
| 2022-01-17 | 0.379340 | -0.193847 | -1.390857 | -1.056517 | -0.158073 |
| 2022-01-18 | 0.785079 | -1.145713 | 0.233175 | -0.294474 | -0.493845 |
| 2022-01-19 | -1.454019 | 0.974247 | -0.442348 | 0.044247 | 1.190501 |
| 2022-01-20 | 0.503909 | -0.776441 | -0.323965 | 0.813752 | -0.409261 |
| 2022-01-21 | -0.270772 | 0.698006 | -0.801287 | -0.182554 | 0.852357 |
| 2022-01-22 | -0.528683 | 0.258139 | 0.589976 | 0.325295 | 1.688333 |
| 2022-01-23 | 0.240170 | -0.518060 | 0.402815 | -1.852271 | -0.960517 |
| 2022-01-24 | 0.615130 | -0.661989 | 1.052420 | 0.529098 | -2.626316 |
| 2022-01-25 | -0.344456 | 1.276411 | -0.625939 | -1.117180 | -0.680828 |
| 2022-01-26 | -0.073498 | 0.611255 | 1.133547 | 0.358266 | -0.694200 |
| 2022-01-27 | -1.981745 | 0.243877 | -0.122814 | 0.343524 | 1.155793 |
| 2022-01-28 | -0.023202 | 0.546315 | -0.150933 | -0.316904 | 0.220736 |
| 2022-01-29 | 0.201237 | 0.241144 | 1.538946 | -0.113433 | 2.243980 |
| 2022-01-30 | 0.073743 | -0.597354 | -0.399961 | 0.624458 | -0.305179 |
| 2022-01-31 | -0.752144 | -0.738474 | 0.248588 | 0.375347 | 0.729071 |
| 2022-02-01 | 0.635539 | 0.457294 | -0.037721 | -0.169999 | -0.373402 |
| 2022-02-02 | 1.761750 | 0.542268 | 0.059822 | 0.817176 | 0.607860 |
# Build a small mixed-dtype frame: a broadcast float scalar, a timestamp,
# a float32 Series, an int32 array, a categorical column, and a constant string.
columns = {
    "A": 1.0,
    "B": pd.Timestamp("20130102"),
    "C": pd.Series(1, index=list(range(4)), dtype="float32"),
    "D": np.array([3] * 4, dtype="int32"),
    "E": pd.Categorical(["test", "train", "test", "train"]),
    "F": "foo",
}
df2 = pd.DataFrame(columns)
df2
| A | B | C | D | E | F | |
|---|---|---|---|---|---|---|
| 0 | 1.0 | 2013-01-02 | 1.0 | 3 | test | foo |
| 1 | 1.0 | 2013-01-02 | 1.0 | 3 | train | foo |
| 2 | 1.0 | 2013-01-02 | 1.0 | 3 | test | foo |
| 3 | 1.0 | 2013-01-02 | 1.0 | 3 | train | foo |
df2.dtypes
A float64 B datetime64[ns] C float32 D int32 E category F object dtype: object
df2.head(2)
| A | B | C | D | E | F | |
|---|---|---|---|---|---|---|
| 0 | 1.0 | 2013-01-02 | 1.0 | 3 | test | foo |
| 1 | 1.0 | 2013-01-02 | 1.0 | 3 | train | foo |
df.tail(2)
| A | B | C | D | E | |
|---|---|---|---|---|---|
| 2022-02-01 | 0.635539 | 0.457294 | -0.037721 | -0.169999 | -0.373402 |
| 2022-02-02 | 1.761750 | 0.542268 | 0.059822 | 0.817176 | 0.607860 |
df2.index
Int64Index([0, 1, 2, 3], dtype='int64')
dates1 = pd.date_range("20220101", periods=20)
dates1
DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
'2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
'2022-01-09', '2022-01-10', '2022-01-11', '2022-01-12',
'2022-01-13', '2022-01-14', '2022-01-15', '2022-01-16',
'2022-01-17', '2022-01-18', '2022-01-19', '2022-01-20'],
dtype='datetime64[ns]', freq='D')
df1= pd.DataFrame(np.random.randn(20,5), index= dates1, columns= list("ABCDE"))
df1
| A | B | C | D | E | |
|---|---|---|---|---|---|
| 2022-01-01 | 0.388135 | -0.694500 | 1.710742 | -0.953853 | 1.647141 |
| 2022-01-02 | -0.517575 | -0.188147 | 0.630036 | -0.881991 | -0.981272 |
| 2022-01-03 | -1.674983 | -0.586637 | -1.340947 | 0.702661 | -0.929540 |
| 2022-01-04 | 0.581906 | -0.080027 | 1.137075 | 0.775445 | -1.796040 |
| 2022-01-05 | -1.273825 | 0.312914 | -1.165314 | -0.169640 | -0.368941 |
| 2022-01-06 | 0.515602 | 0.490068 | -0.545239 | -0.408955 | 1.333627 |
| 2022-01-07 | 0.169663 | -1.019295 | -0.668360 | -2.786986 | -0.199359 |
| 2022-01-08 | -0.130497 | 0.102798 | 0.288203 | -0.291382 | -0.364559 |
| 2022-01-09 | -0.369015 | 1.283904 | 0.839903 | -0.279453 | -1.272016 |
| 2022-01-10 | -0.814083 | -0.705237 | -1.627397 | -2.385590 | -0.144798 |
| 2022-01-11 | -1.542745 | -0.536425 | 0.484937 | -0.486977 | 0.237764 |
| 2022-01-12 | -0.882890 | 0.175792 | -0.131634 | 0.699205 | 0.792908 |
| 2022-01-13 | 1.235072 | 0.640500 | -1.283168 | -1.422045 | -0.402504 |
| 2022-01-14 | -0.085713 | 0.117496 | 2.047838 | 0.240406 | 0.784371 |
| 2022-01-15 | 1.157494 | -0.128045 | -0.278789 | 0.296545 | 0.371711 |
| 2022-01-16 | -0.884580 | -1.623248 | 0.492578 | 0.088836 | -0.049226 |
| 2022-01-17 | -1.324278 | -0.765477 | 0.907903 | -0.808565 | 0.004943 |
| 2022-01-18 | 1.681379 | -0.302969 | -1.158782 | 1.267730 | -2.336489 |
| 2022-01-19 | 0.821827 | 1.307620 | -0.502119 | -1.896293 | -0.194628 |
| 2022-01-20 | -0.875702 | 0.433110 | 0.883599 | 0.561136 | -0.466486 |
a=df1.to_numpy()
array([[ 0.38813495, -0.69450026, 1.71074173, -0.95385292, 1.64714073],
[-0.51757465, -0.18814687, 0.63003587, -0.88199128, -0.98127153],
[-1.67498296, -0.58663723, -1.34094674, 0.70266092, -0.92954028],
[ 0.58190588, -0.08002672, 1.13707543, 0.77544498, -1.79604041],
[-1.27382455, 0.31291439, -1.16531406, -0.1696396 , -0.36894076],
[ 0.51560227, 0.49006838, -0.54523928, -0.40895489, 1.33362693],
[ 0.16966288, -1.01929506, -0.66836035, -2.78698617, -0.19935927],
[-0.13049661, 0.1027985 , 0.28820314, -0.29138204, -0.36455922],
[-0.36901525, 1.28390443, 0.83990284, -0.27945273, -1.27201643],
[-0.81408326, -0.70523725, -1.62739717, -2.38558997, -0.14479825],
[-1.54274494, -0.53642495, 0.48493728, -0.48697742, 0.2377637 ],
[-0.88289022, 0.17579184, -0.13163351, 0.69920486, 0.79290814],
[ 1.23507203, 0.64050031, -1.28316767, -1.42204502, -0.40250438],
[-0.08571273, 0.1174964 , 2.04783829, 0.2404063 , 0.7843707 ],
[ 1.15749355, -0.12804461, -0.27878909, 0.29654452, 0.37171123],
[-0.88458023, -1.62324761, 0.4925782 , 0.08883562, -0.04922586],
[-1.32427812, -0.76547659, 0.90790288, -0.80856471, 0.00494294],
[ 1.68137876, -0.30296923, -1.15878239, 1.26773 , -2.33648888],
[ 0.82182693, 1.30762029, -0.50211862, -1.8962927 , -0.19462804],
[-0.87570223, 0.43311031, 0.88359855, 0.56113551, -0.46648558]])
a.shape
(9,)
df2.to_numpy()
array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
dtype=object)
#details of data
df1.describe()
| A | B | C | D | E | |
|---|---|---|---|---|---|
| count | 20.000000 | 20.000000 | 20.000000 | 20.000000 | 20.000000 |
| mean | -0.191240 | -0.088290 | 0.036053 | -0.406988 | -0.216670 |
| std | 0.984138 | 0.733233 | 1.064758 | 1.084846 | 0.969833 |
| min | -1.674983 | -1.623248 | -1.627397 | -2.786986 | -2.336489 |
| 25% | -0.883313 | -0.613603 | -0.790966 | -0.899957 | -0.582249 |
| 50% | -0.249756 | -0.104036 | 0.078285 | -0.285417 | -0.196994 |
| 75% | 0.532178 | 0.342963 | 0.850827 | 0.362692 | 0.271251 |
| max | 1.681379 | 1.307620 | 2.047838 | 1.267730 | 1.647141 |
#to transpose the data
df2.T
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| A | 1.0 | 1.0 | 1.0 | 1.0 |
| B | 2013-01-02 00:00:00 | 2013-01-02 00:00:00 | 2013-01-02 00:00:00 | 2013-01-02 00:00:00 |
| C | 1.0 | 1.0 | 1.0 | 1.0 |
| D | 3 | 3 | 3 | 3 |
| E | test | train | test | train |
| F | foo | foo | foo | foo |
# Sorting
df1.sort_index(axis=0, ascending=False)
| A | B | C | D | E | |
|---|---|---|---|---|---|
| 2022-01-20 | -0.875702 | 0.433110 | 0.883599 | 0.561136 | -0.466486 |
| 2022-01-19 | 0.821827 | 1.307620 | -0.502119 | -1.896293 | -0.194628 |
| 2022-01-18 | 1.681379 | -0.302969 | -1.158782 | 1.267730 | -2.336489 |
| 2022-01-17 | -1.324278 | -0.765477 | 0.907903 | -0.808565 | 0.004943 |
| 2022-01-16 | -0.884580 | -1.623248 | 0.492578 | 0.088836 | -0.049226 |
| 2022-01-15 | 1.157494 | -0.128045 | -0.278789 | 0.296545 | 0.371711 |
| 2022-01-14 | -0.085713 | 0.117496 | 2.047838 | 0.240406 | 0.784371 |
| 2022-01-13 | 1.235072 | 0.640500 | -1.283168 | -1.422045 | -0.402504 |
| 2022-01-12 | -0.882890 | 0.175792 | -0.131634 | 0.699205 | 0.792908 |
| 2022-01-11 | -1.542745 | -0.536425 | 0.484937 | -0.486977 | 0.237764 |
| 2022-01-10 | -0.814083 | -0.705237 | -1.627397 | -2.385590 | -0.144798 |
| 2022-01-09 | -0.369015 | 1.283904 | 0.839903 | -0.279453 | -1.272016 |
| 2022-01-08 | -0.130497 | 0.102798 | 0.288203 | -0.291382 | -0.364559 |
| 2022-01-07 | 0.169663 | -1.019295 | -0.668360 | -2.786986 | -0.199359 |
| 2022-01-06 | 0.515602 | 0.490068 | -0.545239 | -0.408955 | 1.333627 |
| 2022-01-05 | -1.273825 | 0.312914 | -1.165314 | -0.169640 | -0.368941 |
| 2022-01-04 | 0.581906 | -0.080027 | 1.137075 | 0.775445 | -1.796040 |
| 2022-01-03 | -1.674983 | -0.586637 | -1.340947 | 0.702661 | -0.929540 |
| 2022-01-02 | -0.517575 | -0.188147 | 0.630036 | -0.881991 | -0.981272 |
| 2022-01-01 | 0.388135 | -0.694500 | 1.710742 | -0.953853 | 1.647141 |
df.sort_values(by="B")
| A | B | C | D | E | |
|---|---|---|---|---|---|
| 2022-01-02 | 1.382429 | -1.708730 | 1.176728 | 1.949761 | -0.492011 |
| 2022-01-12 | -0.742913 | -1.484980 | -0.917490 | -0.283180 | -0.053965 |
| 2022-01-11 | -0.637742 | -1.434976 | -1.276443 | -1.153942 | 0.761682 |
| 2022-01-07 | 1.769682 | -1.402604 | -2.110711 | -0.922010 | -0.753877 |
| 2022-01-18 | 0.785079 | -1.145713 | 0.233175 | -0.294474 | -0.493845 |
| 2022-01-05 | -0.896864 | -0.958084 | 0.579021 | 0.478870 | 0.260666 |
| 2022-01-03 | 0.325283 | -0.834825 | -0.879866 | -0.137038 | 0.919688 |
| 2022-01-20 | 0.503909 | -0.776441 | -0.323965 | 0.813752 | -0.409261 |
| 2022-01-31 | -0.752144 | -0.738474 | 0.248588 | 0.375347 | 0.729071 |
| 2022-01-24 | 0.615130 | -0.661989 | 1.052420 | 0.529098 | -2.626316 |
| 2022-01-30 | 0.073743 | -0.597354 | -0.399961 | 0.624458 | -0.305179 |
| 2022-01-23 | 0.240170 | -0.518060 | 0.402815 | -1.852271 | -0.960517 |
| 2022-01-04 | 0.552770 | -0.454804 | -0.747360 | 1.800993 | -0.275458 |
| 2022-01-09 | 0.125091 | -0.340940 | 1.023370 | -1.555463 | 0.322947 |
| 2022-01-06 | -3.103231 | -0.340718 | 0.050150 | 0.058931 | -1.712098 |
| 2022-01-15 | -0.436737 | -0.262544 | 0.686336 | 0.144395 | 0.625462 |
| 2022-01-17 | 0.379340 | -0.193847 | -1.390857 | -1.056517 | -0.158073 |
| 2022-01-14 | 1.321449 | 0.050355 | -0.933186 | -1.043294 | 1.676111 |
| 2022-01-29 | 0.201237 | 0.241144 | 1.538946 | -0.113433 | 2.243980 |
| 2022-01-27 | -1.981745 | 0.243877 | -0.122814 | 0.343524 | 1.155793 |
| 2022-01-22 | -0.528683 | 0.258139 | 0.589976 | 0.325295 | 1.688333 |
| 2022-01-10 | 1.915190 | 0.314069 | -0.638422 | -1.023607 | -1.544831 |
| 2022-02-01 | 0.635539 | 0.457294 | -0.037721 | -0.169999 | -0.373402 |
| 2022-02-02 | 1.761750 | 0.542268 | 0.059822 | 0.817176 | 0.607860 |
| 2022-01-28 | -0.023202 | 0.546315 | -0.150933 | -0.316904 | 0.220736 |
| 2022-01-26 | -0.073498 | 0.611255 | 1.133547 | 0.358266 | -0.694200 |
| 2022-01-21 | -0.270772 | 0.698006 | -0.801287 | -0.182554 | 0.852357 |
| 2022-01-01 | -1.424660 | 0.772210 | 0.783466 | -0.712659 | -2.551065 |
| 2022-01-13 | 0.201580 | 0.786547 | -1.459392 | 0.126138 | -0.312369 |
| 2022-01-16 | -0.152087 | 0.789873 | -1.096439 | -0.172554 | -1.109436 |
| 2022-01-19 | -1.454019 | 0.974247 | -0.442348 | 0.044247 | 1.190501 |
| 2022-01-08 | 0.660826 | 1.017961 | 0.569261 | -1.209834 | 1.056544 |
| 2022-01-25 | -0.344456 | 1.276411 | -0.625939 | -1.117180 | -0.680828 |
df.sort_values(by="B",ascending=False)
| A | B | C | D | E | |
|---|---|---|---|---|---|
| 2022-01-25 | -0.344456 | 1.276411 | -0.625939 | -1.117180 | -0.680828 |
| 2022-01-08 | 0.660826 | 1.017961 | 0.569261 | -1.209834 | 1.056544 |
| 2022-01-19 | -1.454019 | 0.974247 | -0.442348 | 0.044247 | 1.190501 |
| 2022-01-16 | -0.152087 | 0.789873 | -1.096439 | -0.172554 | -1.109436 |
| 2022-01-13 | 0.201580 | 0.786547 | -1.459392 | 0.126138 | -0.312369 |
| 2022-01-01 | -1.424660 | 0.772210 | 0.783466 | -0.712659 | -2.551065 |
| 2022-01-21 | -0.270772 | 0.698006 | -0.801287 | -0.182554 | 0.852357 |
| 2022-01-26 | -0.073498 | 0.611255 | 1.133547 | 0.358266 | -0.694200 |
| 2022-01-28 | -0.023202 | 0.546315 | -0.150933 | -0.316904 | 0.220736 |
| 2022-02-02 | 1.761750 | 0.542268 | 0.059822 | 0.817176 | 0.607860 |
| 2022-02-01 | 0.635539 | 0.457294 | -0.037721 | -0.169999 | -0.373402 |
| 2022-01-10 | 1.915190 | 0.314069 | -0.638422 | -1.023607 | -1.544831 |
| 2022-01-22 | -0.528683 | 0.258139 | 0.589976 | 0.325295 | 1.688333 |
| 2022-01-27 | -1.981745 | 0.243877 | -0.122814 | 0.343524 | 1.155793 |
| 2022-01-29 | 0.201237 | 0.241144 | 1.538946 | -0.113433 | 2.243980 |
| 2022-01-14 | 1.321449 | 0.050355 | -0.933186 | -1.043294 | 1.676111 |
| 2022-01-17 | 0.379340 | -0.193847 | -1.390857 | -1.056517 | -0.158073 |
| 2022-01-15 | -0.436737 | -0.262544 | 0.686336 | 0.144395 | 0.625462 |
| 2022-01-06 | -3.103231 | -0.340718 | 0.050150 | 0.058931 | -1.712098 |
| 2022-01-09 | 0.125091 | -0.340940 | 1.023370 | -1.555463 | 0.322947 |
| 2022-01-04 | 0.552770 | -0.454804 | -0.747360 | 1.800993 | -0.275458 |
| 2022-01-23 | 0.240170 | -0.518060 | 0.402815 | -1.852271 | -0.960517 |
| 2022-01-30 | 0.073743 | -0.597354 | -0.399961 | 0.624458 | -0.305179 |
| 2022-01-24 | 0.615130 | -0.661989 | 1.052420 | 0.529098 | -2.626316 |
| 2022-01-31 | -0.752144 | -0.738474 | 0.248588 | 0.375347 | 0.729071 |
| 2022-01-20 | 0.503909 | -0.776441 | -0.323965 | 0.813752 | -0.409261 |
| 2022-01-03 | 0.325283 | -0.834825 | -0.879866 | -0.137038 | 0.919688 |
| 2022-01-05 | -0.896864 | -0.958084 | 0.579021 | 0.478870 | 0.260666 |
| 2022-01-18 | 0.785079 | -1.145713 | 0.233175 | -0.294474 | -0.493845 |
| 2022-01-07 | 1.769682 | -1.402604 | -2.110711 | -0.922010 | -0.753877 |
| 2022-01-11 | -0.637742 | -1.434976 | -1.276443 | -1.153942 | 0.761682 |
| 2022-01-12 | -0.742913 | -1.484980 | -0.917490 | -0.283180 | -0.053965 |
| 2022-01-02 | 1.382429 | -1.708730 | 1.176728 | 1.949761 | -0.492011 |
df1["A"]
2022-01-01 0.388135 2022-01-02 -0.517575 2022-01-03 -1.674983 2022-01-04 0.581906 2022-01-05 -1.273825 2022-01-06 0.515602 2022-01-07 0.169663 2022-01-08 -0.130497 2022-01-09 -0.369015 2022-01-10 -0.814083 2022-01-11 -1.542745 2022-01-12 -0.882890 2022-01-13 1.235072 2022-01-14 -0.085713 2022-01-15 1.157494 2022-01-16 -0.884580 2022-01-17 -1.324278 2022-01-18 1.681379 2022-01-19 0.821827 2022-01-20 -0.875702 Freq: D, Name: A, dtype: float64
# filtering the data column-wise or index-wise
df1["B"]
2022-01-01 -0.694500 2022-01-02 -0.188147 2022-01-03 -0.586637 2022-01-04 -0.080027 2022-01-05 0.312914 2022-01-06 0.490068 2022-01-07 -1.019295 2022-01-08 0.102798 2022-01-09 1.283904 2022-01-10 -0.705237 2022-01-11 -0.536425 2022-01-12 0.175792 2022-01-13 0.640500 2022-01-14 0.117496 2022-01-15 -0.128045 2022-01-16 -1.623248 2022-01-17 -0.765477 2022-01-18 -0.302969 2022-01-19 1.307620 2022-01-20 0.433110 Freq: D, Name: B, dtype: float64
# TO select data row wise
df[0:1]
| A | B | C | D | E | |
|---|---|---|---|---|---|
| 2022-01-01 | -1.42466 | 0.77221 | 0.783466 | -0.712659 | -2.551065 |
df[0:2]
| A | B | C | D | E | |
|---|---|---|---|---|---|
| 2022-01-01 | -1.424660 | 0.77221 | 0.783466 | -0.712659 | -2.551065 |
| 2022-01-02 | 1.382429 | -1.70873 | 1.176728 | 1.949761 | -0.492011 |
df[0:10]
| A | B | C | D | E | |
|---|---|---|---|---|---|
| 2022-01-01 | -1.424660 | 0.772210 | 0.783466 | -0.712659 | -2.551065 |
| 2022-01-02 | 1.382429 | -1.708730 | 1.176728 | 1.949761 | -0.492011 |
| 2022-01-03 | 0.325283 | -0.834825 | -0.879866 | -0.137038 | 0.919688 |
| 2022-01-04 | 0.552770 | -0.454804 | -0.747360 | 1.800993 | -0.275458 |
| 2022-01-05 | -0.896864 | -0.958084 | 0.579021 | 0.478870 | 0.260666 |
| 2022-01-06 | -3.103231 | -0.340718 | 0.050150 | 0.058931 | -1.712098 |
| 2022-01-07 | 1.769682 | -1.402604 | -2.110711 | -0.922010 | -0.753877 |
| 2022-01-08 | 0.660826 | 1.017961 | 0.569261 | -1.209834 | 1.056544 |
| 2022-01-09 | 0.125091 | -0.340940 | 1.023370 | -1.555463 | 0.322947 |
| 2022-01-10 | 1.915190 | 0.314069 | -0.638422 | -1.023607 | -1.544831 |
df[1:10]
| A | B | C | D | E | |
|---|---|---|---|---|---|
| 2022-01-02 | 1.382429 | -1.708730 | 1.176728 | 1.949761 | -0.492011 |
| 2022-01-03 | 0.325283 | -0.834825 | -0.879866 | -0.137038 | 0.919688 |
| 2022-01-04 | 0.552770 | -0.454804 | -0.747360 | 1.800993 | -0.275458 |
| 2022-01-05 | -0.896864 | -0.958084 | 0.579021 | 0.478870 | 0.260666 |
| 2022-01-06 | -3.103231 | -0.340718 | 0.050150 | 0.058931 | -1.712098 |
| 2022-01-07 | 1.769682 | -1.402604 | -2.110711 | -0.922010 | -0.753877 |
| 2022-01-08 | 0.660826 | 1.017961 | 0.569261 | -1.209834 | 1.056544 |
| 2022-01-09 | 0.125091 | -0.340940 | 1.023370 | -1.555463 | 0.322947 |
| 2022-01-10 | 1.915190 | 0.314069 | -0.638422 | -1.023607 | -1.544831 |
df1.head()
| A | B | C | D | E | |
|---|---|---|---|---|---|
| 2022-01-01 | 0.388135 | -0.694500 | 1.710742 | -0.953853 | 1.647141 |
| 2022-01-02 | -0.517575 | -0.188147 | 0.630036 | -0.881991 | -0.981272 |
| 2022-01-03 | -1.674983 | -0.586637 | -1.340947 | 0.702661 | -0.929540 |
| 2022-01-04 | 0.581906 | -0.080027 | 1.137075 | 0.775445 | -1.796040 |
| 2022-01-05 | -1.273825 | 0.312914 | -1.165314 | -0.169640 | -0.368941 |
# showing only the 16th row and its values
df.loc[dates[15]]
A -0.152087 B 0.789873 C -1.096439 D -0.172554 E -1.109436 Name: 2022-01-16 00:00:00, dtype: float64
# selecting on multiple axis labels
df.loc[:, ["A","B"]]
| A | B | |
|---|---|---|
| 2022-01-01 | -1.424660 | 0.772210 |
| 2022-01-02 | 1.382429 | -1.708730 |
| 2022-01-03 | 0.325283 | -0.834825 |
| 2022-01-04 | 0.552770 | -0.454804 |
| 2022-01-05 | -0.896864 | -0.958084 |
| 2022-01-06 | -3.103231 | -0.340718 |
| 2022-01-07 | 1.769682 | -1.402604 |
| 2022-01-08 | 0.660826 | 1.017961 |
| 2022-01-09 | 0.125091 | -0.340940 |
| 2022-01-10 | 1.915190 | 0.314069 |
| 2022-01-11 | -0.637742 | -1.434976 |
| 2022-01-12 | -0.742913 | -1.484980 |
| 2022-01-13 | 0.201580 | 0.786547 |
| 2022-01-14 | 1.321449 | 0.050355 |
| 2022-01-15 | -0.436737 | -0.262544 |
| 2022-01-16 | -0.152087 | 0.789873 |
| 2022-01-17 | 0.379340 | -0.193847 |
| 2022-01-18 | 0.785079 | -1.145713 |
| 2022-01-19 | -1.454019 | 0.974247 |
| 2022-01-20 | 0.503909 | -0.776441 |
| 2022-01-21 | -0.270772 | 0.698006 |
| 2022-01-22 | -0.528683 | 0.258139 |
| 2022-01-23 | 0.240170 | -0.518060 |
| 2022-01-24 | 0.615130 | -0.661989 |
| 2022-01-25 | -0.344456 | 1.276411 |
| 2022-01-26 | -0.073498 | 0.611255 |
| 2022-01-27 | -1.981745 | 0.243877 |
| 2022-01-28 | -0.023202 | 0.546315 |
| 2022-01-29 | 0.201237 | 0.241144 |
| 2022-01-30 | 0.073743 | -0.597354 |
| 2022-01-31 | -0.752144 | -0.738474 |
| 2022-02-01 | 0.635539 | 0.457294 |
| 2022-02-02 | 1.761750 | 0.542268 |
df.loc["20220109":"20220113",["A","B", "C"]]
| A | B | C | |
|---|---|---|---|
| 2022-01-09 | 0.125091 | -0.340940 | 1.023370 |
| 2022-01-10 | 1.915190 | 0.314069 | -0.638422 |
| 2022-01-11 | -0.637742 | -1.434976 | -1.276443 |
| 2022-01-12 | -0.742913 | -1.484980 | -0.917490 |
| 2022-01-13 | 0.201580 | 0.786547 | -1.459392 |
df.loc["20220109",["A","B", "C"]]
A 0.125091 B -0.340940 C 1.023370 Name: 2022-01-09 00:00:00, dtype: float64
#Scalar value
df.at[dates[0],"A"]
-1.4246598703006963
df.iloc[3]
A 0.552770 B -0.454804 C -0.747360 D 1.800993 E -0.275458 Name: 2022-01-04 00:00:00, dtype: float64
df.iloc[3:10]
| A | B | C | D | E | |
|---|---|---|---|---|---|
| 2022-01-04 | 0.552770 | -0.454804 | -0.747360 | 1.800993 | -0.275458 |
| 2022-01-05 | -0.896864 | -0.958084 | 0.579021 | 0.478870 | 0.260666 |
| 2022-01-06 | -3.103231 | -0.340718 | 0.050150 | 0.058931 | -1.712098 |
| 2022-01-07 | 1.769682 | -1.402604 | -2.110711 | -0.922010 | -0.753877 |
| 2022-01-08 | 0.660826 | 1.017961 | 0.569261 | -1.209834 | 1.056544 |
| 2022-01-09 | 0.125091 | -0.340940 | 1.023370 | -1.555463 | 0.322947 |
| 2022-01-10 | 1.915190 | 0.314069 | -0.638422 | -1.023607 | -1.544831 |
# rows col
df.iloc[0:5, 0:2]
| A | B | |
|---|---|---|
| 2022-01-01 | -1.424660 | 0.772210 |
| 2022-01-02 | 1.382429 | -1.708730 |
| 2022-01-03 | 0.325283 | -0.834825 |
| 2022-01-04 | 0.552770 | -0.454804 |
| 2022-01-05 | -0.896864 | -0.958084 |
df.iloc[:, 0:2]
| A | B | |
|---|---|---|
| 2022-01-01 | -1.424660 | 0.772210 |
| 2022-01-02 | 1.382429 | -1.708730 |
| 2022-01-03 | 0.325283 | -0.834825 |
| 2022-01-04 | 0.552770 | -0.454804 |
| 2022-01-05 | -0.896864 | -0.958084 |
| 2022-01-06 | -3.103231 | -0.340718 |
| 2022-01-07 | 1.769682 | -1.402604 |
| 2022-01-08 | 0.660826 | 1.017961 |
| 2022-01-09 | 0.125091 | -0.340940 |
| 2022-01-10 | 1.915190 | 0.314069 |
| 2022-01-11 | -0.637742 | -1.434976 |
| 2022-01-12 | -0.742913 | -1.484980 |
| 2022-01-13 | 0.201580 | 0.786547 |
| 2022-01-14 | 1.321449 | 0.050355 |
| 2022-01-15 | -0.436737 | -0.262544 |
| 2022-01-16 | -0.152087 | 0.789873 |
| 2022-01-17 | 0.379340 | -0.193847 |
| 2022-01-18 | 0.785079 | -1.145713 |
| 2022-01-19 | -1.454019 | 0.974247 |
| 2022-01-20 | 0.503909 | -0.776441 |
| 2022-01-21 | -0.270772 | 0.698006 |
| 2022-01-22 | -0.528683 | 0.258139 |
| 2022-01-23 | 0.240170 | -0.518060 |
| 2022-01-24 | 0.615130 | -0.661989 |
| 2022-01-25 | -0.344456 | 1.276411 |
| 2022-01-26 | -0.073498 | 0.611255 |
| 2022-01-27 | -1.981745 | 0.243877 |
| 2022-01-28 | -0.023202 | 0.546315 |
| 2022-01-29 | 0.201237 | 0.241144 |
| 2022-01-30 | 0.073743 | -0.597354 |
| 2022-01-31 | -0.752144 | -0.738474 |
| 2022-02-01 | 0.635539 | 0.457294 |
| 2022-02-02 | 1.761750 | 0.542268 |
df[df["A"]> 0]
| A | B | C | D | E | |
|---|---|---|---|---|---|
| 2022-01-02 | 1.382429 | -1.708730 | 1.176728 | 1.949761 | -0.492011 |
| 2022-01-03 | 0.325283 | -0.834825 | -0.879866 | -0.137038 | 0.919688 |
| 2022-01-04 | 0.552770 | -0.454804 | -0.747360 | 1.800993 | -0.275458 |
| 2022-01-07 | 1.769682 | -1.402604 | -2.110711 | -0.922010 | -0.753877 |
| 2022-01-08 | 0.660826 | 1.017961 | 0.569261 | -1.209834 | 1.056544 |
| 2022-01-09 | 0.125091 | -0.340940 | 1.023370 | -1.555463 | 0.322947 |
| 2022-01-10 | 1.915190 | 0.314069 | -0.638422 | -1.023607 | -1.544831 |
| 2022-01-13 | 0.201580 | 0.786547 | -1.459392 | 0.126138 | -0.312369 |
| 2022-01-14 | 1.321449 | 0.050355 | -0.933186 | -1.043294 | 1.676111 |
| 2022-01-17 | 0.379340 | -0.193847 | -1.390857 | -1.056517 | -0.158073 |
| 2022-01-18 | 0.785079 | -1.145713 | 0.233175 | -0.294474 | -0.493845 |
| 2022-01-20 | 0.503909 | -0.776441 | -0.323965 | 0.813752 | -0.409261 |
| 2022-01-23 | 0.240170 | -0.518060 | 0.402815 | -1.852271 | -0.960517 |
| 2022-01-24 | 0.615130 | -0.661989 | 1.052420 | 0.529098 | -2.626316 |
| 2022-01-29 | 0.201237 | 0.241144 | 1.538946 | -0.113433 | 2.243980 |
| 2022-01-30 | 0.073743 | -0.597354 | -0.399961 | 0.624458 | -0.305179 |
| 2022-02-01 | 0.635539 | 0.457294 | -0.037721 | -0.169999 | -0.373402 |
| 2022-02-02 | 1.761750 | 0.542268 | 0.059822 | 0.817176 | 0.607860 |
df[df["A"]> 0]
| A | B | C | D | E | |
|---|---|---|---|---|---|
| 2022-01-02 | 1.382429 | -1.708730 | 1.176728 | 1.949761 | -0.492011 |
| 2022-01-03 | 0.325283 | -0.834825 | -0.879866 | -0.137038 | 0.919688 |
| 2022-01-04 | 0.552770 | -0.454804 | -0.747360 | 1.800993 | -0.275458 |
| 2022-01-07 | 1.769682 | -1.402604 | -2.110711 | -0.922010 | -0.753877 |
| 2022-01-08 | 0.660826 | 1.017961 | 0.569261 | -1.209834 | 1.056544 |
| 2022-01-09 | 0.125091 | -0.340940 | 1.023370 | -1.555463 | 0.322947 |
| 2022-01-10 | 1.915190 | 0.314069 | -0.638422 | -1.023607 | -1.544831 |
| 2022-01-13 | 0.201580 | 0.786547 | -1.459392 | 0.126138 | -0.312369 |
| 2022-01-14 | 1.321449 | 0.050355 | -0.933186 | -1.043294 | 1.676111 |
| 2022-01-17 | 0.379340 | -0.193847 | -1.390857 | -1.056517 | -0.158073 |
| 2022-01-18 | 0.785079 | -1.145713 | 0.233175 | -0.294474 | -0.493845 |
| 2022-01-20 | 0.503909 | -0.776441 | -0.323965 | 0.813752 | -0.409261 |
| 2022-01-23 | 0.240170 | -0.518060 | 0.402815 | -1.852271 | -0.960517 |
| 2022-01-24 | 0.615130 | -0.661989 | 1.052420 | 0.529098 | -2.626316 |
| 2022-01-29 | 0.201237 | 0.241144 | 1.538946 | -0.113433 | 2.243980 |
| 2022-01-30 | 0.073743 | -0.597354 | -0.399961 | 0.624458 | -0.305179 |
| 2022-02-01 | 0.635539 | 0.457294 | -0.037721 | -0.169999 | -0.373402 |
| 2022-02-02 | 1.761750 | 0.542268 | 0.059822 | 0.817176 | 0.607860 |
# assignment :Getting non zero values in more then one column
df[ df.iloc[:, 0:5]> 0]
| A | B | C | D | E | |
|---|---|---|---|---|---|
| 2022-01-01 | NaN | 0.772210 | 0.783466 | NaN | NaN |
| 2022-01-02 | 1.382429 | NaN | 1.176728 | 1.949761 | NaN |
| 2022-01-03 | 0.325283 | NaN | NaN | NaN | 0.919688 |
| 2022-01-04 | 0.552770 | NaN | NaN | 1.800993 | NaN |
| 2022-01-05 | NaN | NaN | 0.579021 | 0.478870 | 0.260666 |
| 2022-01-06 | NaN | NaN | 0.050150 | 0.058931 | NaN |
| 2022-01-07 | 1.769682 | NaN | NaN | NaN | NaN |
| 2022-01-08 | 0.660826 | 1.017961 | 0.569261 | NaN | 1.056544 |
| 2022-01-09 | 0.125091 | NaN | 1.023370 | NaN | 0.322947 |
| 2022-01-10 | 1.915190 | 0.314069 | NaN | NaN | NaN |
| 2022-01-11 | NaN | NaN | NaN | NaN | 0.761682 |
| 2022-01-12 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-13 | 0.201580 | 0.786547 | NaN | 0.126138 | NaN |
| 2022-01-14 | 1.321449 | 0.050355 | NaN | NaN | 1.676111 |
| 2022-01-15 | NaN | NaN | 0.686336 | 0.144395 | 0.625462 |
| 2022-01-16 | NaN | 0.789873 | NaN | NaN | NaN |
| 2022-01-17 | 0.379340 | NaN | NaN | NaN | NaN |
| 2022-01-18 | 0.785079 | NaN | 0.233175 | NaN | NaN |
| 2022-01-19 | NaN | 0.974247 | NaN | 0.044247 | 1.190501 |
| 2022-01-20 | 0.503909 | NaN | NaN | 0.813752 | NaN |
| 2022-01-21 | NaN | 0.698006 | NaN | NaN | 0.852357 |
| 2022-01-22 | NaN | 0.258139 | 0.589976 | 0.325295 | 1.688333 |
| 2022-01-23 | 0.240170 | NaN | 0.402815 | NaN | NaN |
| 2022-01-24 | 0.615130 | NaN | 1.052420 | 0.529098 | NaN |
| 2022-01-25 | NaN | 1.276411 | NaN | NaN | NaN |
| 2022-01-26 | NaN | 0.611255 | 1.133547 | 0.358266 | NaN |
| 2022-01-27 | NaN | 0.243877 | NaN | 0.343524 | 1.155793 |
| 2022-01-28 | NaN | 0.546315 | NaN | NaN | 0.220736 |
| 2022-01-29 | 0.201237 | 0.241144 | 1.538946 | NaN | 2.243980 |
| 2022-01-30 | 0.073743 | NaN | NaN | 0.624458 | NaN |
| 2022-01-31 | NaN | NaN | 0.248588 | 0.375347 | 0.729071 |
| 2022-02-01 | 0.635539 | 0.457294 | NaN | NaN | NaN |
| 2022-02-02 | 1.761750 | 0.542268 | 0.059822 | 0.817176 | 0.607860 |
df[df["A"]> 0]
| A | B | C | D | E | |
|---|---|---|---|---|---|
| 2022-01-02 | 1.382429 | -1.708730 | 1.176728 | 1.949761 | -0.492011 |
| 2022-01-03 | 0.325283 | -0.834825 | -0.879866 | -0.137038 | 0.919688 |
| 2022-01-04 | 0.552770 | -0.454804 | -0.747360 | 1.800993 | -0.275458 |
| 2022-01-07 | 1.769682 | -1.402604 | -2.110711 | -0.922010 | -0.753877 |
| 2022-01-08 | 0.660826 | 1.017961 | 0.569261 | -1.209834 | 1.056544 |
| 2022-01-09 | 0.125091 | -0.340940 | 1.023370 | -1.555463 | 0.322947 |
| 2022-01-10 | 1.915190 | 0.314069 | -0.638422 | -1.023607 | -1.544831 |
| 2022-01-13 | 0.201580 | 0.786547 | -1.459392 | 0.126138 | -0.312369 |
| 2022-01-14 | 1.321449 | 0.050355 | -0.933186 | -1.043294 | 1.676111 |
| 2022-01-17 | 0.379340 | -0.193847 | -1.390857 | -1.056517 | -0.158073 |
| 2022-01-18 | 0.785079 | -1.145713 | 0.233175 | -0.294474 | -0.493845 |
| 2022-01-20 | 0.503909 | -0.776441 | -0.323965 | 0.813752 | -0.409261 |
| 2022-01-23 | 0.240170 | -0.518060 | 0.402815 | -1.852271 | -0.960517 |
| 2022-01-24 | 0.615130 | -0.661989 | 1.052420 | 0.529098 | -2.626316 |
| 2022-01-29 | 0.201237 | 0.241144 | 1.538946 | -0.113433 | 2.243980 |
| 2022-01-30 | 0.073743 | -0.597354 | -0.399961 | 0.624458 | -0.305179 |
| 2022-02-01 | 0.635539 | 0.457294 | -0.037721 | -0.169999 | -0.373402 |
| 2022-02-02 | 1.761750 | 0.542268 | 0.059822 | 0.817176 | 0.607860 |
df[df>0]
| A | B | C | D | E | |
|---|---|---|---|---|---|
| 2022-01-01 | NaN | 0.772210 | 0.783466 | NaN | NaN |
| 2022-01-02 | 1.382429 | NaN | 1.176728 | 1.949761 | NaN |
| 2022-01-03 | 0.325283 | NaN | NaN | NaN | 0.919688 |
| 2022-01-04 | 0.552770 | NaN | NaN | 1.800993 | NaN |
| 2022-01-05 | NaN | NaN | 0.579021 | 0.478870 | 0.260666 |
| 2022-01-06 | NaN | NaN | 0.050150 | 0.058931 | NaN |
| 2022-01-07 | 1.769682 | NaN | NaN | NaN | NaN |
| 2022-01-08 | 0.660826 | 1.017961 | 0.569261 | NaN | 1.056544 |
| 2022-01-09 | 0.125091 | NaN | 1.023370 | NaN | 0.322947 |
| 2022-01-10 | 1.915190 | 0.314069 | NaN | NaN | NaN |
| 2022-01-11 | NaN | NaN | NaN | NaN | 0.761682 |
| 2022-01-12 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-13 | 0.201580 | 0.786547 | NaN | 0.126138 | NaN |
| 2022-01-14 | 1.321449 | 0.050355 | NaN | NaN | 1.676111 |
| 2022-01-15 | NaN | NaN | 0.686336 | 0.144395 | 0.625462 |
| 2022-01-16 | NaN | 0.789873 | NaN | NaN | NaN |
| 2022-01-17 | 0.379340 | NaN | NaN | NaN | NaN |
| 2022-01-18 | 0.785079 | NaN | 0.233175 | NaN | NaN |
| 2022-01-19 | NaN | 0.974247 | NaN | 0.044247 | 1.190501 |
| 2022-01-20 | 0.503909 | NaN | NaN | 0.813752 | NaN |
| 2022-01-21 | NaN | 0.698006 | NaN | NaN | 0.852357 |
| 2022-01-22 | NaN | 0.258139 | 0.589976 | 0.325295 | 1.688333 |
| 2022-01-23 | 0.240170 | NaN | 0.402815 | NaN | NaN |
| 2022-01-24 | 0.615130 | NaN | 1.052420 | 0.529098 | NaN |
| 2022-01-25 | NaN | 1.276411 | NaN | NaN | NaN |
| 2022-01-26 | NaN | 0.611255 | 1.133547 | 0.358266 | NaN |
| 2022-01-27 | NaN | 0.243877 | NaN | 0.343524 | 1.155793 |
| 2022-01-28 | NaN | 0.546315 | NaN | NaN | 0.220736 |
| 2022-01-29 | 0.201237 | 0.241144 | 1.538946 | NaN | 2.243980 |
| 2022-01-30 | 0.073743 | NaN | NaN | 0.624458 | NaN |
| 2022-01-31 | NaN | NaN | 0.248588 | 0.375347 | 0.729071 |
| 2022-02-01 | 0.635539 | 0.457294 | NaN | NaN | NaN |
| 2022-02-02 | 1.761750 | 0.542268 | 0.059822 | 0.817176 | 0.607860 |
df[ df.iloc[0:3]> 0 ]
| A | B | C | D | E | |
|---|---|---|---|---|---|
| 2022-01-01 | NaN | 0.77221 | 0.783466 | NaN | NaN |
| 2022-01-02 | 1.382429 | NaN | 1.176728 | 1.949761 | NaN |
| 2022-01-03 | 0.325283 | NaN | NaN | NaN | 0.919688 |
| 2022-01-04 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-05 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-06 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-07 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-08 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-09 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-10 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-11 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-12 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-13 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-14 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-15 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-16 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-17 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-18 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-19 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-20 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-21 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-22 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-23 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-24 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-25 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-26 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-27 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-28 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-29 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-30 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-31 | NaN | NaN | NaN | NaN | NaN |
| 2022-02-01 | NaN | NaN | NaN | NaN | NaN |
| 2022-02-02 | NaN | NaN | NaN | NaN | NaN |
# IS IN Method
df3 = df.copy()
df3
| A | B | C | D | E | |
|---|---|---|---|---|---|
| 2022-01-01 | -1.424660 | 0.772210 | 0.783466 | -0.712659 | -2.551065 |
| 2022-01-02 | 1.382429 | -1.708730 | 1.176728 | 1.949761 | -0.492011 |
| 2022-01-03 | 0.325283 | -0.834825 | -0.879866 | -0.137038 | 0.919688 |
| 2022-01-04 | 0.552770 | -0.454804 | -0.747360 | 1.800993 | -0.275458 |
| 2022-01-05 | -0.896864 | -0.958084 | 0.579021 | 0.478870 | 0.260666 |
| 2022-01-06 | -3.103231 | -0.340718 | 0.050150 | 0.058931 | -1.712098 |
| 2022-01-07 | 1.769682 | -1.402604 | -2.110711 | -0.922010 | -0.753877 |
| 2022-01-08 | 0.660826 | 1.017961 | 0.569261 | -1.209834 | 1.056544 |
| 2022-01-09 | 0.125091 | -0.340940 | 1.023370 | -1.555463 | 0.322947 |
| 2022-01-10 | 1.915190 | 0.314069 | -0.638422 | -1.023607 | -1.544831 |
| 2022-01-11 | -0.637742 | -1.434976 | -1.276443 | -1.153942 | 0.761682 |
| 2022-01-12 | -0.742913 | -1.484980 | -0.917490 | -0.283180 | -0.053965 |
| 2022-01-13 | 0.201580 | 0.786547 | -1.459392 | 0.126138 | -0.312369 |
| 2022-01-14 | 1.321449 | 0.050355 | -0.933186 | -1.043294 | 1.676111 |
| 2022-01-15 | -0.436737 | -0.262544 | 0.686336 | 0.144395 | 0.625462 |
| 2022-01-16 | -0.152087 | 0.789873 | -1.096439 | -0.172554 | -1.109436 |
| 2022-01-17 | 0.379340 | -0.193847 | -1.390857 | -1.056517 | -0.158073 |
| 2022-01-18 | 0.785079 | -1.145713 | 0.233175 | -0.294474 | -0.493845 |
| 2022-01-19 | -1.454019 | 0.974247 | -0.442348 | 0.044247 | 1.190501 |
| 2022-01-20 | 0.503909 | -0.776441 | -0.323965 | 0.813752 | -0.409261 |
| 2022-01-21 | -0.270772 | 0.698006 | -0.801287 | -0.182554 | 0.852357 |
| 2022-01-22 | -0.528683 | 0.258139 | 0.589976 | 0.325295 | 1.688333 |
| 2022-01-23 | 0.240170 | -0.518060 | 0.402815 | -1.852271 | -0.960517 |
| 2022-01-24 | 0.615130 | -0.661989 | 1.052420 | 0.529098 | -2.626316 |
| 2022-01-25 | -0.344456 | 1.276411 | -0.625939 | -1.117180 | -0.680828 |
| 2022-01-26 | -0.073498 | 0.611255 | 1.133547 | 0.358266 | -0.694200 |
| 2022-01-27 | -1.981745 | 0.243877 | -0.122814 | 0.343524 | 1.155793 |
| 2022-01-28 | -0.023202 | 0.546315 | -0.150933 | -0.316904 | 0.220736 |
| 2022-01-29 | 0.201237 | 0.241144 | 1.538946 | -0.113433 | 2.243980 |
| 2022-01-30 | 0.073743 | -0.597354 | -0.399961 | 0.624458 | -0.305179 |
| 2022-01-31 | -0.752144 | -0.738474 | 0.248588 | 0.375347 | 0.729071 |
| 2022-02-01 | 0.635539 | 0.457294 | -0.037721 | -0.169999 | -0.373402 |
| 2022-02-02 | 1.761750 | 0.542268 | 0.059822 | 0.817176 | 0.607860 |
#Adding a column
df3["BABA"]= [1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,9,10,1,2,3]
df3
| A | B | C | D | E | BABA | |
|---|---|---|---|---|---|---|
| 2022-01-01 | -1.424660 | 0.772210 | 0.783466 | -0.712659 | -2.551065 | 1 |
| 2022-01-02 | 1.382429 | -1.708730 | 1.176728 | 1.949761 | -0.492011 | 2 |
| 2022-01-03 | 0.325283 | -0.834825 | -0.879866 | -0.137038 | 0.919688 | 3 |
| 2022-01-04 | 0.552770 | -0.454804 | -0.747360 | 1.800993 | -0.275458 | 4 |
| 2022-01-05 | -0.896864 | -0.958084 | 0.579021 | 0.478870 | 0.260666 | 5 |
| 2022-01-06 | -3.103231 | -0.340718 | 0.050150 | 0.058931 | -1.712098 | 6 |
| 2022-01-07 | 1.769682 | -1.402604 | -2.110711 | -0.922010 | -0.753877 | 7 |
| 2022-01-08 | 0.660826 | 1.017961 | 0.569261 | -1.209834 | 1.056544 | 8 |
| 2022-01-09 | 0.125091 | -0.340940 | 1.023370 | -1.555463 | 0.322947 | 9 |
| 2022-01-10 | 1.915190 | 0.314069 | -0.638422 | -1.023607 | -1.544831 | 10 |
| 2022-01-11 | -0.637742 | -1.434976 | -1.276443 | -1.153942 | 0.761682 | 1 |
| 2022-01-12 | -0.742913 | -1.484980 | -0.917490 | -0.283180 | -0.053965 | 2 |
| 2022-01-13 | 0.201580 | 0.786547 | -1.459392 | 0.126138 | -0.312369 | 3 |
| 2022-01-14 | 1.321449 | 0.050355 | -0.933186 | -1.043294 | 1.676111 | 4 |
| 2022-01-15 | -0.436737 | -0.262544 | 0.686336 | 0.144395 | 0.625462 | 5 |
| 2022-01-16 | -0.152087 | 0.789873 | -1.096439 | -0.172554 | -1.109436 | 6 |
| 2022-01-17 | 0.379340 | -0.193847 | -1.390857 | -1.056517 | -0.158073 | 7 |
| 2022-01-18 | 0.785079 | -1.145713 | 0.233175 | -0.294474 | -0.493845 | 8 |
| 2022-01-19 | -1.454019 | 0.974247 | -0.442348 | 0.044247 | 1.190501 | 9 |
| 2022-01-20 | 0.503909 | -0.776441 | -0.323965 | 0.813752 | -0.409261 | 10 |
| 2022-01-21 | -0.270772 | 0.698006 | -0.801287 | -0.182554 | 0.852357 | 1 |
| 2022-01-22 | -0.528683 | 0.258139 | 0.589976 | 0.325295 | 1.688333 | 2 |
| 2022-01-23 | 0.240170 | -0.518060 | 0.402815 | -1.852271 | -0.960517 | 3 |
| 2022-01-24 | 0.615130 | -0.661989 | 1.052420 | 0.529098 | -2.626316 | 4 |
| 2022-01-25 | -0.344456 | 1.276411 | -0.625939 | -1.117180 | -0.680828 | 5 |
| 2022-01-26 | -0.073498 | 0.611255 | 1.133547 | 0.358266 | -0.694200 | 6 |
| 2022-01-27 | -1.981745 | 0.243877 | -0.122814 | 0.343524 | 1.155793 | 7 |
| 2022-01-28 | -0.023202 | 0.546315 | -0.150933 | -0.316904 | 0.220736 | 8 |
| 2022-01-29 | 0.201237 | 0.241144 | 1.538946 | -0.113433 | 2.243980 | 9 |
| 2022-01-30 | 0.073743 | -0.597354 | -0.399961 | 0.624458 | -0.305179 | 10 |
| 2022-01-31 | -0.752144 | -0.738474 | 0.248588 | 0.375347 | 0.729071 | 1 |
| 2022-02-01 | 0.635539 | 0.457294 | -0.037721 | -0.169999 | -0.373402 | 2 |
| 2022-02-02 | 1.761750 | 0.542268 | 0.059822 | 0.817176 | 0.607860 | 3 |
#Adding a new column having same value of previous column
df3["Mean"] =df3["A"]
df3.head()
| A | B | C | D | E | BABA | Mean | |
|---|---|---|---|---|---|---|---|
| 2022-01-01 | -1.424660 | 0.772210 | 0.783466 | -0.712659 | -2.551065 | 1 | -1.424660 |
| 2022-01-02 | 1.382429 | -1.708730 | 1.176728 | 1.949761 | -0.492011 | 2 | 1.382429 |
| 2022-01-03 | 0.325283 | -0.834825 | -0.879866 | -0.137038 | 0.919688 | 3 | 0.325283 |
| 2022-01-04 | 0.552770 | -0.454804 | -0.747360 | 1.800993 | -0.275458 | 4 | 0.552770 |
| 2022-01-05 | -0.896864 | -0.958084 | 0.579021 | 0.478870 | 0.260666 | 5 | -0.896864 |
#Adding a column having mean of previous values
# #Assignment no 2
df3["Mean"] =df3.mean(axis= 1)
df3.head()
| A | B | C | D | E | BABA | Mean | |
|---|---|---|---|---|---|---|---|
| 2022-01-01 | -1.424660 | 0.772210 | 0.783466 | -0.712659 | -2.551065 | 1 | -0.508195 |
| 2022-01-02 | 1.382429 | -1.708730 | 1.176728 | 1.949761 | -0.492011 | 2 | 0.812944 |
| 2022-01-03 | 0.325283 | -0.834825 | -0.879866 | -0.137038 | 0.919688 | 3 | 0.388361 |
| 2022-01-04 | 0.552770 | -0.454804 | -0.747360 | 1.800993 | -0.275458 | 4 | 0.775559 |
| 2022-01-05 | -0.896864 | -0.958084 | 0.579021 | 0.478870 | 0.260666 | 5 | 0.509535 |
df["New"]="new hai "
df.head()
| A | B | C | D | E | New | |
|---|---|---|---|---|---|---|
| 2022-01-01 | -1.424660 | 0.772210 | 0.783466 | -0.712659 | -2.551065 | new hai |
| 2022-01-02 | 1.382429 | -1.708730 | 1.176728 | 1.949761 | -0.492011 | new hai |
| 2022-01-03 | 0.325283 | -0.834825 | -0.879866 | -0.137038 | 0.919688 | new hai |
| 2022-01-04 | 0.552770 | -0.454804 | -0.747360 | 1.800993 | -0.275458 | new hai |
| 2022-01-05 | -0.896864 | -0.958084 | 0.579021 | 0.478870 | 0.260666 | new hai |
df.insert(2,"Beech me ", "18+")
df.head()
| A | B | Beech me | C | D | E | New | |
|---|---|---|---|---|---|---|---|
| 2022-01-01 | -1.424660 | 0.772210 | 18+ | 0.783466 | -0.712659 | -2.551065 | new hai |
| 2022-01-02 | 1.382429 | -1.708730 | 18+ | 1.176728 | 1.949761 | -0.492011 | new hai |
| 2022-01-03 | 0.325283 | -0.834825 | 18+ | -0.879866 | -0.137038 | 0.919688 | new hai |
| 2022-01-04 | 0.552770 | -0.454804 | 18+ | -0.747360 | 1.800993 | -0.275458 | new hai |
| 2022-01-05 | -0.896864 | -0.958084 | 18+ | 0.579021 | 0.478870 | 0.260666 | new hai |
df["Concatinated"]= df["A"]+df["B"]
df.head()
| A | B | Beech me | C | D | E | New | Concatinated | |
|---|---|---|---|---|---|---|---|---|
| 2022-01-01 | -1.424660 | 0.772210 | 18+ | 0.783466 | -0.712659 | -2.551065 | new hai | -0.652450 |
| 2022-01-02 | 1.382429 | -1.708730 | 18+ | 1.176728 | 1.949761 | -0.492011 | new hai | -0.326301 |
| 2022-01-03 | 0.325283 | -0.834825 | 18+ | -0.879866 | -0.137038 | 0.919688 | new hai | -0.509542 |
| 2022-01-04 | 0.552770 | -0.454804 | 18+ | -0.747360 | 1.800993 | -0.275458 | new hai | 0.097965 |
| 2022-01-05 | -0.896864 | -0.958084 | 18+ | 0.579021 | 0.478870 | 0.260666 | new hai | -1.854949 |
#importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
kashti= sns.load_dataset("titanic")
kashti.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 survived 891 non-null int64 1 pclass 891 non-null int64 2 sex 891 non-null object 3 age 714 non-null float64 4 sibsp 891 non-null int64 5 parch 891 non-null int64 6 fare 891 non-null float64 7 embarked 889 non-null object 8 class 891 non-null category 9 who 891 non-null object 10 adult_male 891 non-null bool 11 deck 203 non-null category 12 embark_town 889 non-null object 13 alive 891 non-null object 14 alone 891 non-null bool dtypes: bool(2), category(2), float64(2), int64(4), object(5) memory usage: 80.7+ KB
ks= kashti
#just to see the datset
ks.head()
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
ks.shape
#Rows x column
(891, 15)
ks.tail()
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 886 | 0 | 2 | male | 27.0 | 0 | 0 | 13.00 | S | Second | man | True | NaN | Southampton | no | True |
| 887 | 1 | 1 | female | 19.0 | 0 | 0 | 30.00 | S | First | woman | False | B | Southampton | yes | True |
| 888 | 0 | 3 | female | NaN | 1 | 2 | 23.45 | S | Third | woman | False | NaN | Southampton | no | False |
| 889 | 1 | 1 | male | 26.0 | 0 | 0 | 30.00 | C | First | man | True | C | Cherbourg | yes | True |
| 890 | 0 | 3 | male | 32.0 | 0 | 0 | 7.75 | Q | Third | man | True | NaN | Queenstown | no | True |
ks.describe()
| survived | pclass | age | sibsp | parch | fare | |
|---|---|---|---|---|---|---|
| count | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
| mean | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
| std | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
| min | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
| 50% | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
| max | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
# unique values
ks.nunique()
survived 2 pclass 3 sex 2 age 88 sibsp 7 parch 7 fare 248 embarked 3 class 3 who 3 adult_male 2 deck 7 embark_town 3 alive 2 alone 2 dtype: int64
# coloumn names
ks.columns
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
'alive', 'alone'],
dtype='object')
ks["sex"].unique()
array(['male', 'female'], dtype=object)
ks['who'].unique()
array(['man', 'woman', 'child'], dtype=object)
np.union1d(ks["who"].unique(), ks["sex"].unique())
array(['child', 'female', 'male', 'man', 'woman'], dtype=object)
ks[['who', "sex"]].nunique()
who 3 sex 2 dtype: int64
# find missing valus inside
ks.isnull()
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False |
| 1 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 2 | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False |
| 3 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 4 | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False |
| 887 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 888 | False | False | False | True | False | False | False | False | False | False | False | True | False | False | False |
| 889 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 890 | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False |
891 rows × 15 columns
ks.isnull().sum()
survived 0 pclass 0 sex 0 age 177 sibsp 0 parch 0 fare 0 embarked 2 class 0 who 0 adult_male 0 deck 688 embark_town 2 alive 0 alone 0 dtype: int64
# removing missing value column
ks_clean= ks.drop (["deck"], axis= 1)
ks_clean.head()
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | Southampton | yes | True |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | Southampton | yes | False |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | Southampton | no | True |
ks_clean.isnull().sum()
survived 0 pclass 0 sex 0 age 177 sibsp 0 parch 0 fare 0 embarked 2 class 0 who 0 adult_male 0 embark_town 2 alive 0 alone 0 dtype: int64
ks_clean.shape
(891, 14)
891-177-2
# 177 rows are missing "age" and 2 more are missing "embarked"/"embark_town",
# leaving 712 complete rows after dropna()
712
ks_clean= ks_clean.dropna()
ks_clean.head()
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | Southampton | yes | True |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | Southampton | yes | False |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | Southampton | no | True |
ks_clean.dropna().shape
(712, 14)
ks_clean.isnull().sum()
survived 0 pclass 0 sex 0 age 0 sibsp 0 parch 0 fare 0 embarked 0 class 0 who 0 adult_male 0 embark_town 0 alive 0 alone 0 dtype: int64
ks_clean.shape
(712, 14)
ks.shape
(891, 15)
ks_clean["age"].value_counts()
24.00 30
22.00 27
18.00 26
19.00 25
28.00 25
..
36.50 1
55.50 1
0.92 1
23.50 1
74.00 1
Name: age, Length: 88, dtype: int64
ks_clean["sex"].value_counts()
male 453 female 259 Name: sex, dtype: int64
ks_clean["sex"].value_counts()
male 453 female 259 Name: sex, dtype: int64
ks.describe()
| survived | pclass | age | sibsp | parch | fare | |
|---|---|---|---|---|---|---|
| count | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
| mean | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
| std | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
| min | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
| 50% | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
| max | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
ks_clean.describe()
| survived | pclass | age | sibsp | parch | fare | |
|---|---|---|---|---|---|---|
| count | 712.000000 | 712.000000 | 712.000000 | 712.000000 | 712.000000 | 712.000000 |
| mean | 0.404494 | 2.240169 | 29.642093 | 0.514045 | 0.432584 | 34.567251 |
| std | 0.491139 | 0.836854 | 14.492933 | 0.930692 | 0.854181 | 52.938648 |
| min | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 1.000000 | 20.000000 | 0.000000 | 0.000000 | 8.050000 |
| 50% | 0.000000 | 2.000000 | 28.000000 | 0.000000 | 0.000000 | 15.645850 |
| 75% | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 1.000000 | 33.000000 |
| max | 1.000000 | 3.000000 | 80.000000 | 5.000000 | 6.000000 | 512.329200 |
ks_clean.columns
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alive',
'alone'],
dtype='object')
sns.boxplot(x= "sex", y='age', data =ks_clean )
# here we can see the outlyers in age
<AxesSubplot:xlabel='sex', ylabel='age'>
sns.boxplot(y='age', data =ks_clean )
<AxesSubplot:ylabel='age'>
sns.distplot(ks_clean["age"] )
# here we are seeing the bell curve / histogram for normality check
# here we can see that it is not perfectly bell curve means data is not perfect
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='age', ylabel='Density'>
ks_clean["age"].mean()
29.64209269662921
ks_clean.head()
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | Southampton | yes | True |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | Southampton | yes | False |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | Southampton | no | True |
# Removing outliers: keep only passengers younger than 68
# NOTE(review): 68 looks like an eyeballed cutoff from the boxplot above —
# confirm it is intended rather than a rule like 1.5*IQR
ks_clean = ks_clean.loc[ks_clean["age"] < 68]
ks_clean.head()
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | Southampton | yes | True |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | Southampton | yes | False |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | Southampton | no | True |
ks_clean.shape
(705, 14)
ks_clean["age"].mean()
29.21797163120567
sns.distplot(ks_clean["age"] )
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='age', ylabel='Density'>
sns.boxplot(y='age', data =ks_clean )
<AxesSubplot:ylabel='age'>
ks_clean.head()
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | Southampton | yes | True |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | Southampton | yes | False |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | Southampton | no | True |
ks_clean.boxplot()
<AxesSubplot:>
ks_clean= ks_clean[ks_clean["fare"]< 300]
ks_clean.boxplot()
<AxesSubplot:>
sns.distplot(ks_clean["fare"] )
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='fare', ylabel='Density'>
ks_clean.hist()
array([[<AxesSubplot:title={'center':'survived'}>,
<AxesSubplot:title={'center':'pclass'}>],
[<AxesSubplot:title={'center':'age'}>,
<AxesSubplot:title={'center':'sibsp'}>],
[<AxesSubplot:title={'center':'parch'}>,
<AxesSubplot:title={'center':'fare'}>]], dtype=object)
pd.value_counts(ks_clean["survived"])
0 418 1 284 Name: survived, dtype: int64
pd.value_counts(ks_clean["survived"]).plot.bar()
<AxesSubplot:>
pd.value_counts(ks_clean["sex"]).plot.bar()
<AxesSubplot:>
ks_clean.groupby(["sex","class","who"]).mean()
| survived | pclass | age | sibsp | parch | fare | adult_male | alone | |||
|---|---|---|---|---|---|---|---|---|---|---|
| sex | class | who | ||||||||
| female | First | child | 0.666667 | 1.0 | 10.333333 | 0.666667 | 1.666667 | 160.962500 | 0.0 | 0.000000 |
| man | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ||
| woman | 0.974684 | 1.0 | 35.139241 | 0.556962 | 0.468354 | 101.521730 | 0.0 | 0.367089 | ||
| Second | child | 1.000000 | 2.0 | 6.600000 | 0.700000 | 1.300000 | 29.240000 | 0.0 | 0.000000 | |
| man | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ||
| woman | 0.906250 | 2.0 | 32.179688 | 0.468750 | 0.515625 | 20.812175 | 0.0 | 0.468750 | ||
| Third | child | 0.533333 | 3.0 | 7.100000 | 1.533333 | 1.100000 | 19.023753 | 0.0 | 0.166667 | |
| man | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ||
| woman | 0.430556 | 3.0 | 27.854167 | 0.527778 | 0.888889 | 14.563542 | 0.0 | 0.458333 | ||
| male | First | child | 1.000000 | 1.0 | 5.306667 | 0.666667 | 2.000000 | 117.802767 | 0.0 | 0.000000 |
| man | 0.369565 | 1.0 | 41.201087 | 0.380435 | 0.282609 | 61.110824 | 1.0 | 0.543478 | ||
| woman | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ||
| Second | child | 1.000000 | 2.0 | 2.258889 | 0.888889 | 1.222222 | 27.306022 | 0.0 | 0.000000 | |
| man | 0.067416 | 2.0 | 33.179775 | 0.325843 | 0.146067 | 20.606133 | 1.0 | 0.696629 | ||
| woman | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ||
| Third | child | 0.321429 | 3.0 | 6.515000 | 2.821429 | 1.321429 | 27.716371 | 0.0 | 0.035714 | |
| man | 0.130045 | 3.0 | 28.607623 | 0.201794 | 0.125561 | 10.249231 | 1.0 | 0.825112 | ||
| woman | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
cor_ks_clean= ks_clean.corr()
cor_ks_clean
| survived | pclass | age | sibsp | parch | fare | adult_male | alone | |
|---|---|---|---|---|---|---|---|---|
| survived | 1.000000 | -0.356549 | -0.074335 | -0.014483 | 0.095426 | 0.273531 | -0.554567 | -0.201175 |
| pclass | -0.356549 | 1.000000 | -0.365121 | 0.061354 | 0.022519 | -0.617591 | 0.102930 | 0.156030 |
| age | -0.074335 | -0.365121 | 1.000000 | -0.308906 | -0.186271 | 0.103100 | 0.275035 | 0.187284 |
| sibsp | -0.014483 | 0.061354 | -0.308906 | 1.000000 | 0.381803 | 0.197954 | -0.311622 | -0.629200 |
| parch | 0.095426 | 0.022519 | -0.186271 | 0.381803 | 1.000000 | 0.259948 | -0.366540 | -0.574701 |
| fare | 0.273531 | -0.617591 | 0.103100 | 0.197954 | 0.259948 | 1.000000 | -0.228675 | -0.333949 |
| adult_male | -0.554567 | 0.102930 | 0.275035 | -0.311622 | -0.366540 | -0.228675 | 1.000000 | 0.402214 |
| alone | -0.201175 | 0.156030 | 0.187284 | -0.629200 | -0.574701 | -0.333949 | 0.402214 | 1.000000 |
sns.heatmap(cor_ks_clean)
<AxesSubplot:>
sns.heatmap(cor_ks_clean , annot=True)
<AxesSubplot:>
sns.relplot(x= "age", y="fare", hue="sex", data=ks_clean)
<seaborn.axisgrid.FacetGrid at 0x25b2f22ab90>
sns.catplot(x= "sex", y="fare",data=ks_clean)
<seaborn.axisgrid.FacetGrid at 0x25b2f39ed10>
sns.catplot(x= "sex", y="fare",data=ks_clean, kind= "bar")
<seaborn.axisgrid.FacetGrid at 0x25b2c0b28c0>
sns.catplot(x= "sex", y="age", hue= "who" , data=ks_clean, kind= "box")
<seaborn.axisgrid.FacetGrid at 0x25b2c69dc00>
sns.catplot(x= "sex", y="fare", hue= "who" , data=ks_clean, kind= "box")
<seaborn.axisgrid.FacetGrid at 0x25b3066b850>
ks_clean.head()
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | Southampton | yes | True |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | Southampton | yes | False |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | Southampton | no | True |
# log transformation of fare to reduce the right skew
# NOTE: fares of 0 give np.log(0) = -inf, which is what triggers the
# "divide by zero encountered in log" RuntimeWarning shown below
ks_clean["fare_log"] = np.log(ks_clean["fare"])
ks_clean.head()
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\arraylike.py:364: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs)
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | embark_town | alive | alone | fare_log | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | Southampton | no | False | 1.981001 |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | Cherbourg | yes | False | 4.266662 |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | Southampton | yes | True | 2.070022 |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | Southampton | yes | False | 3.972177 |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | Southampton | no | True | 2.085672 |
sns.catplot(x= "sex", y="fare_log" , data=ks_clean, kind= "box")
<seaborn.axisgrid.FacetGrid at 0x25b306c1510>
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
kashti = sns.load_dataset("titanic")
ks1 = kashti
# # ks2 = kashti
kashti.head()
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
# simple operations (Math operations)
(kashti["age"]+12).head(10)
0 34.0 1 50.0 2 38.0 3 47.0 4 47.0 5 NaN 6 66.0 7 14.0 8 39.0 9 26.0 Name: age, dtype: float64
(kashti["age"]*2).head(3)
0 44.0 1 76.0 2 52.0 Name: age, dtype: float64
Steps:
1- Koshsih kren dobara data collect kar len agar kahin ghalti hai.
2- missing value wala variable (coloumn) hi nikal den agr data per effect nahi hta ya simple row or data entry remove kr den .
3- Replace the missing values :
- How?
1. Average value of the entire variable or of a similar data point
2. frequency or MODE replacement
3. Replace based on other functions (Data sampler knows that)
4. ML algorithm can also be used
5. Leave it like that
- Why?
1. It's better because no data is lost
2. Less accurate
# where exactly missing value are ?
kashti.isnull().sum()
survived 0 pclass 0 sex 0 age 177 sibsp 0 parch 0 fare 0 embarked 2 class 0 who 0 adult_male 0 deck 688 embark_town 2 alive 0 alone 0 dtype: int64
# use a drop.na method
print (kashti.shape)
(891, 15)
# removing missing value column
# ks_clean= ks.drop (["deck"], axis= 1)
# ks_clean.head()
# Instead of dropping the "deck" column, keep only the rows where "deck"
# is present (891 -> 203 rows).
# NOTE: inplace=True mutates the kashti object itself; ks1 was assigned
# from kashti earlier (ks1 = kashti), so while both names reference the
# same object this filtering affects ks1 as well.
kashti.dropna(subset=["deck"], axis=0 , inplace = True)
kashti.shape
(203, 15)
kashti.isnull().sum()
survived 0 pclass 0 sex 0 age 19 sibsp 0 parch 0 fare 0 embarked 2 class 0 who 0 adult_male 0 deck 0 embark_town 2 alive 0 alone 0 dtype: int64
# removing na from whole dataframe
kashti = kashti.dropna()
kashti.isnull().sum()
survived 0 pclass 0 sex 0 age 0 sibsp 0 parch 0 fare 0 embarked 0 class 0 who 0 adult_male 0 deck 0 embark_town 0 alive 0 alone 0 dtype: int64
kashti.shape
(182, 15)
ks1.isnull().sum()
survived 0 pclass 0 sex 0 age 19 sibsp 0 parch 0 fare 0 embarked 2 class 0 who 0 adult_male 0 deck 0 embark_town 2 alive 0 alone 0 dtype: int64
# finding an average (mean) of the age column
# (pandas skips NaN by default, so this is the mean of the known ages only)
mean = ks1["age"].mean()
mean
35.77945652173913
# replacing NaN in "age" with the column mean computed above
# (fillna is the idiomatic equivalent of replace(np.nan, mean))
ks1["age"] = ks1["age"].fillna(mean)
ks1["age"].head(10)
1 38.000000 3 35.000000 6 54.000000 10 4.000000 11 58.000000 21 34.000000 23 28.000000 27 19.000000 31 35.779457 52 49.000000 Name: age, dtype: float64
ks1.isnull().sum()
survived 0 pclass 0 sex 0 age 0 sibsp 0 parch 0 fare 0 embarked 2 class 0 who 0 adult_male 0 deck 0 embark_town 2 alive 0 alone 0 dtype: int64
mode = ks1["deck"].mode()[0]
mode
'C'
# Fill missing "deck" values with the most frequent category (mode).
# FIX: assign the result back instead of calling fillna(..., inplace=True)
# on a column selected from the DataFrame — that is chained assignment,
# which pandas has deprecated and which stops updating the parent frame
# in newer pandas versions.
ks1['deck'] = ks1['deck'].fillna(mode)
ks1['deck'].head(12)
1 C 3 C 6 E 10 G 11 C 21 D 23 A 27 C 31 B 52 D 54 B 55 C Name: deck, dtype: category Categories (7, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G']
ks1.dropna(subset=["embarked"], axis=0 , inplace = True)
ks1.isnull().sum()
survived 0 pclass 0 sex 0 age 0 sibsp 0 parch 0 fare 0 embarked 0 class 0 who 0 adult_male 0 deck 0 embark_town 0 alive 0 alone 0 dtype: int64
some others
cols = ["workclass", "native-country"]
df[cols]=df[cols].fillna(df.mode().iloc[0])
or we can use a mapping
ks1['deck'].map({"A" : np.nan})
# know the data type and convert it into the know one
kashti.dtypes
survived int64 pclass int64 sex object age float64 sibsp int64 parch int64 fare float64 embarked object class category who object adult_male bool deck category embark_town object alive object alone bool dtype: object
# use this method to convert datatype from one to another format
# (the SettingWithCopyWarning printed below appears because kashti was
# produced by filtering another DataFrame; assigning a column on such a
# slice may not propagate — see the pandas "view versus copy" caveats)
kashti["survived"]= kashti["survived"].astype("float64")
kashti.dtypes
C:\Users\Epazz\AppData\Local\Temp/ipykernel_12924/2743140032.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
kashti["survived"]= kashti["survived"].astype("float64")
survived float64 pclass int64 sex object age float64 sibsp int64 parch int64 fare float64 embarked object class category who object adult_male bool deck category embark_town object alive object alone bool dtype: object
# here we will convert age into days instead of years
# (flat factor of 365 — leap years are deliberately ignored)
ks1["age"] = ks1["age"]*365
ks1["age"].head(8)
1 13870.0 3 12775.0 6 19710.0 10 1460.0 11 21170.0 21 12410.0 23 10220.0 27 6935.0 Name: age, dtype: float64
# always rename afterwards
ks1.rename(columns={"age": "age in days"}, inplace=True)
ks1.head()
| survived | pclass | sex | age in days | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 1 | female | 13870.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
| 3 | 1 | 1 | female | 12775.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
| 6 | 0 | 1 | male | 19710.0 | 0 | 0 | 51.8625 | S | First | man | True | E | Southampton | no | True |
| 10 | 1 | 3 | female | 1460.0 | 1 | 1 | 16.7000 | S | Third | child | False | G | Southampton | yes | False |
| 11 | 1 | 1 | female | 21170.0 | 0 | 0 | 26.5500 | S | First | woman | False | C | Southampton | yes | True |
ks1["age in days"]= ks1["age in days"].astype("int64")
ks1.head()
| survived | pclass | sex | age in days | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 1 | female | 13870 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
| 3 | 1 | 1 | female | 12775 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
| 6 | 0 | 1 | male | 19710 | 0 | 0 | 51.8625 | S | First | man | True | E | Southampton | no | True |
| 10 | 1 | 3 | female | 1460 | 1 | 1 | 16.7000 | S | Third | child | False | G | Southampton | yes | False |
| 11 | 1 | 1 | female | 21170 | 0 | 0 | 26.5500 | S | First | woman | False | C | Southampton | yes | True |
ks1.head()
| survived | pclass | sex | age in days | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 1 | female | 13870 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
| 3 | 1 | 1 | female | 12775 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
| 6 | 0 | 1 | male | 19710 | 0 | 0 | 51.8625 | S | First | man | True | E | Southampton | no | True |
| 10 | 1 | 3 | female | 1460 | 1 | 1 | 16.7000 | S | Third | child | False | G | Southampton | yes | False |
| 11 | 1 | 1 | female | 21170 | 0 | 0 | 26.5500 | S | First | woman | False | C | Southampton | yes | True |
# Take the two numeric columns as an independent DataFrame.
# FIX: without .copy() ks4 is a slice of ks1, and the scaling assignments
# in the following cells raise SettingWithCopyWarning and may silently
# fail to write; copying makes ks4 safe to mutate.
ks4= ks1[["age in days","fare"]].copy()
ks4.head()
| age in days | fare | |
|---|---|---|
| 1 | 13870 | 71.2833 |
| 3 | 12775 | 53.1000 |
| 6 | 19710 | 51.8625 |
| 10 | 1460 | 16.7000 |
| 11 | 21170 | 26.5500 |
# simple feature scalling: divide by the column max so values fall in [0, 1]
# NOTE(review): ks4 is a slice of ks1, so these assignments emit
# SettingWithCopyWarning; building ks4 with .copy() would avoid it
ks4["fare"]= ks4["fare"]/ks4["fare"].max()
ks4["age in days"]= ks4["age in days"]/ks4["age in days"].max()
ks4.head()
C:\Users\Epazz\AppData\Local\Temp/ipykernel_12924/607069502.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy ks4["fare"]= ks4["fare"]/ks4["fare"].max() C:\Users\Epazz\AppData\Local\Temp/ipykernel_12924/607069502.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy ks4["age in days"]= ks4["age in days"]/ks4["age in days"].max()
| age in days | fare | |
|---|---|---|
| 1 | 0.4750 | 0.139136 |
| 3 | 0.4375 | 0.103644 |
| 6 | 0.6750 | 0.101229 |
| 10 | 0.0500 | 0.032596 |
| 11 | 0.7250 | 0.051822 |
# Min- Max method
# x.new = (x.old - x.min) / (x.max- x.min)
# NOTE(review): in this notebook "fare" was already max-scaled in the
# previous cell, so this min-max runs on the rescaled values, not on the
# original fares
ks4["fare"] = (ks4["fare"]-ks4["fare"].min())/ (ks4["fare"].max()-ks4["fare"].min())
ks4.head()
C:\Users\Epazz\AppData\Local\Temp/ipykernel_12924/4070571851.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy ks4["fare"] = (ks4["fare"]-ks4["fare"].min())/ (ks4["fare"].max()-ks4["fare"].min())
| age in days | fare | |
|---|---|---|
| 1 | 0.4750 | 0.139136 |
| 3 | 0.4375 | 0.103644 |
| 6 | 0.6750 | 0.101229 |
| 10 | 0.0500 | 0.032596 |
| 11 | 0.7250 | 0.051822 |
# z-score method: standardize to mean 0 / std 1
# x.new = (x.old -x.mean) / x.std
# NOTE(review): applied on top of the previous transformations of "fare"
ks4["fare"] =(ks4["fare"]- ks4["fare"].mean()) / ks4["fare"].std()
ks4.head()
C:\Users\Epazz\AppData\Local\Temp/ipykernel_12924/3694817774.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy ks4["fare"] =(ks4["fare"]- ks4["fare"].mean()) / ks4["fare"].std()
| age in days | fare | |
|---|---|---|
| 1 | 0.4750 | -0.067057 |
| 3 | 0.4375 | -0.309853 |
| 6 | 0.6750 | -0.326377 |
| 10 | 0.0500 | -0.795891 |
| 11 | 0.7250 | -0.664367 |
ks =sns.load_dataset("titanic")
ks["fare"].head()
0 7.2500 1 71.2833 2 7.9250 3 53.1000 4 8.0500 Name: fare, dtype: float64
# log transfromation
# x.new = np.log(x.old)
ks["fare"] = np.log(ks["fare"])
ks["fare"].head()
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\arraylike.py:364: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs)
0 1.981001 1 4.266662 2 2.070022 3 3.972177 4 2.085672 Name: fare, dtype: float64
kashti.head()
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1.0 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
| 3 | 1.0 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
| 6 | 0.0 | 1 | male | 54.0 | 0 | 0 | 51.8625 | S | First | man | True | E | Southampton | no | True |
| 10 | 1.0 | 3 | female | 4.0 | 1 | 1 | 16.7000 | S | Third | child | False | G | Southampton | yes | False |
| 11 | 1.0 | 1 | female | 58.0 | 0 | 0 | 26.5500 | S | First | woman | False | C | Southampton | yes | True |
ks4["age in days"].shape
(201,)
# Creating bins
# np.linspace with 4 points yields 3 equal-width bins spanning the
# observed range of "age in days"
bins = np.linspace(min(ks1["age in days"]), max(ks1["age in days"]), 4)
bins
array([ 335. , 9956.66666667, 19578.33333333, 29200. ])
# Label each bin and overwrite the numeric column with the category;
# include_lowest=True closes the first interval on the left so the
# minimum value itself falls into the first bin.
age_groups = ["bachay", "jawan", "Boorhay"]
ks1["age in days"] = pd.cut(ks1["age in days"], bins, labels= age_groups ,include_lowest=True )
ks1["age in days"]
1 jawan
3 jawan
6 Boorhay
10 bachay
11 Boorhay
...
871 jawan
872 jawan
879 Boorhay
887 bachay
889 bachay
Name: age in days, Length: 201, dtype: category
Categories (3, object): ['bachay' < 'jawan' < 'Boorhay']
ks4.head()
| age in days | fare | |
|---|---|---|
| 1 | 0.4750 | -0.067057 |
| 3 | 0.4375 | -0.309853 |
| 6 | 0.6750 | -0.326377 |
| 10 | 0.0500 | -0.795891 |
| 11 | 0.7250 | -0.664367 |
ks1.head()
| survived | pclass | sex | age in days | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 1 | female | jawan | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
| 3 | 1 | 1 | female | jawan | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
| 6 | 0 | 1 | male | Boorhay | 0 | 0 | 51.8625 | S | First | man | True | E | Southampton | no | True |
| 10 | 1 | 3 | female | bachay | 1 | 1 | 16.7000 | S | Third | child | False | G | Southampton | yes | False |
| 11 | 1 | 1 | female | Boorhay | 0 | 0 | 26.5500 | S | First | woman | False | C | Southampton | yes | True |
# One-hot encode "sex": build dummy columns, then swap them in for the
# original categorical column.
ks5 = sns.load_dataset("titanic")
data = pd.get_dummies(ks5["sex"])
ks5 = ks5.drop('sex',axis = 1).join(data)
ks5.head()
| survived | pclass | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | female | male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False | 0 | 1 |
| 1 | 1 | 1 | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False | 1 | 0 |
| 2 | 1 | 3 | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True | 1 | 0 |
| 3 | 1 | 1 | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False | 1 | 0 |
| 4 | 0 | 3 | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True | 0 | 1 |
The design which is used when the experimental material is limited and homogeneous is known as completely randomized design. This design is specially used for pot culture experiments.
The whole field is divided into plots of similar shape and size. The number of plots is equal to the product of treatments and replications. These plots are then serially numbered
The experimental design which controls the fertility variation in one direction only is known as randomized block design (RBD). Adoption of this design is useful when the variation between the blocks is significant.
First the experimental field is divided into homogeneous groups equal to the number of replications. These homogeneous groups are known as blocks. Then each block is further divided into plots of similar shape and size equal to the number of treatments.
The experimental design which simultaneously controls the fertility variation in two directions is called Latin square design (LSD). In other words, Latin square designs are adopted for eliminating the variation of two factors which are generally called rows and columns.
In this design the field is divided into homogeneous blocks in two ways. The blocks in one direction are commonly known as rows and the blocks in other direction as columns. The number of plots in each row is the same as the number of plots in each column. This number is equal to the number of treatments.
The experimental design in which experimental plots are split or divided into main plots, subplots and ultimate-plots is called split plot design (SPD). In this design several factors are studied simultaneously with different levels of precision. The factors are such that some of them require larger plots like irrigation, depth of ploughing and sowing dates, and others require smaller plots.
The layout of this design consists of four steps as given below:
(a) First the experimental field is divided into homogeneous blocks equal to the number of replications
(b) Then each block is divided into a number of plots equal to the number of levels of the first factor. These plots are known as main plots.
(c) Then each main plot is divided into a number of sub-plots equal to the number of levels of second factor.
(d) Then each sub-plot is divided into a number of ultimate plots equal to the number of levels of third factor.
Lattice designs are incomplete block designs in which the number of varieties or treatments forms a square.
The experimental field is divided into homogeneous parts equal to the number of replications. Each part is further divided into plots of equal size in such a way that the number of plots should form a square and each replication has equal plots in each direction (i.e., equal rows and columns).
This is an experimental design which is used to test a large number of germplasm lines in a limited area.
In this design, standard or check varieties are replicated among the cultures. Thus, standards are replicated and cultures are non-replicated. The number of check varieties should be at least 4.
The variance of the variable in data are equal
Test to be used : **Levene's Test**
Know the purpose of the research question
There are two types of purpose
1- Comparison
2- Relationship
Know the type of the data
- Catagorical = Qualitative
- Numerical = Quantitative
Choose a statistical test from three main families
1- Chi-Squared test
Purpose : Comparison
Data : Catagorical only
Types:
* Chi- squared test of homogeneity
* Chi- squared test of independence
2- t-test/ANOVA
Purpose : Comparison
Data : Catagorical and continuous
Types:
* One sample t-test
* Two sample t-test
** Un-paired t-test
** Paired t-test
* ANOVA
** One way ANOVA
** Two way ANOVA
** Repeated measures of ANOVA
* MANOVA
* MANCOVA
3- Correlation
Purpose : Relationship
Data : Continuous only
Types:
1- Pearson Correlation
2- Regression
Chi-Squared test
Chi-square is a statistical test used to examine the differences between categorical variables from a random sample in order to judge the goodness of fit between expected and observed results. This is a non-parametric test.
One sample t-test
The One Sample t Test examines whether the mean of a population is statistically different from a known or hypothesized value. The One Sample t Test is a parametric test.
Two sample t-test or Independent Samples t-Test
The Independent Samples t Test compares the means of two independent groups in order to determine whether there is statistical evidence that the associated population means are significantly different. The Independent Samples t Test is a parametric test.
Unpaired t-test = Comparison between the math marks of girls and boys (a comparison between different populations is involved)
Paired t-test = Comparison between the math and stats marks of boys (boys are one population, and the comparison involves their two subjects)
One way ANOVA
"one-way" ANOVA compares levels (i.e. groups) of a single factor based on single continuous response variable (e.g. comparing *test score*(continous variable) by 'level of education')
One factor and one continous variable
Two way ANOVA
a "two-way" ANOVA compares levels of two or more factors for mean differences on a single continuous response variable(e.g. comparing test score by both 'level of education' and 'zodiac sign').
Two factor for one continous variable
Repeated measures of ANOVA
The repeated measures ANOVA compares means across one or more variables that are based on repeated observations. A repeated measures ANOVA model can also include zero or more independent variables. Again, a repeated measures ANOVA has at least 1 dependent variable that has more than one observation.
MANOVA "Multivariate Analysis of Variance"
In basic terms, A MANOVA is an ANOVA with two or more continuous response variables
One Way MANOVA
When comparing two or more continuous response variables by a single factor, a one-way MANOVA is appropriate (e.g. comparing ‘test score’ and ‘annual income’ together by ‘level of education’).
Two continous variable with one factor
Two way MANOVA
A two-way MANOVA also entails two or more continuous response variables, but compares them by at least two factors (e.g. comparing ‘test score’ and ‘annual income’ together by both ‘level of education’ and ‘zodiac sign’).
Two continous variable with two factors
MANCOVA (Multi-variate analysis of co-variance)
an analysis evolves from MANOVA to MANCOVA when one or more covariates are added to the mix.
eg: MANCOVA compares two or more continuous response variables (e.g.comparing continous variable (Test Scores and Annual Income) by levels of a factor variable (e.g. Level of Education), controlling for a covariate (e.g. Number of Hours Spent Studying).
Test whether a data sample has gaussian distribution.
# Shapiro-Wilk normality test on a small ten-point sample; the returned
# object bundles the W statistic and the p-value.
from scipy.stats import shapiro

data = [
    0.873, 2.817, 0.121, -0.945, -0.055,
    -1.436, 0.360, -1.478, -1.637, -1.869,
]
shapiro(data)
#here we see that the p value is greater than 0.05 so the data is normally distributed
ShapiroResult(statistic=0.8951009511947632, pvalue=0.19340917468070984)
# Run the Shapiro-Wilk test and report the W statistic and the p-value
# on separate lines.
from scipy.stats import shapiro

data = [
    0.873, 2.817, 0.121, -0.945, -0.055,
    -1.436, 0.360, -1.478, -1.637, -1.869,
]
result = shapiro(data)
stat, p = result
print("stat =", stat)
print("p =", p)
stat = 0.8951009511947632 p = 0.19340917468070984
# To write it respectively
from scipy.stats import shapiro
# Same ten-point sample as the previous cells.
data = [0.873, 2.817 , 0.121, -0.945, -0.055, -1.436, 0.360 , -1.478, -1.637, -1.869]
# Unpack the (W statistic, p-value) pair returned by the test.
stat, p=shapiro(data)
# NOTE(review): "%3f" is a *minimum field width* of 3 with the default
# 6-digit precision (hence the 6-decimal output); "%.3f" was probably
# intended for three decimal places — confirm.
print ("stat=%3f, p= %3f" %(stat, p))
# print("stat =",stat)
# print("p =" ,p)
stat=0.895101, p= 0.193409
# Example of the Shapiro-Wilk normality test with a decision rule.
from scipy.stats import shapiro

# Ten-point sample used throughout these examples.
data = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
stat, p = shapiro(data)
# NOTE(review): "%3f" is a minimum field width of 3 (default 6-digit
# precision); "%.3f" was probably intended.
print("stat=%3f, p= %3f" % (stat, p))
# Conventional 5% significance level: fail to reject normality when p > 0.05.
# BUG FIX: the if/else bodies had lost their indentation, which is a
# SyntaxError in Python — restored here.
if p > 0.05:
    print("probability Gaussian")
else:
    print("Probability not gaussian")
stat=0.895101, p= 0.193409 probability Gaussian
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
ks = sns.load_dataset("titanic")
sns.boxplot(ks["age"])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
<AxesSubplot:xlabel='age'>
ks["age"].hist()
<AxesSubplot:>
# Normality test on the Titanic "age" column.
from scipy.stats import shapiro

# BUG FIX: "age" contains missing values; shapiro() on a NaN-bearing
# column returns (nan, 1.0), so the test below silently (and wrongly)
# reported "normal". Drop the missing values before testing.
stat, p = shapiro(ks["age"].dropna())
print("stat =", stat)
print("p =", p)
# 5% significance level; indentation of the branches restored.
if p > 0.05:
    print("probability Gaussian or the data is normal ")
else:
    print("Probability not gaussian or the data is not normal ")
stat = nan p = 1.0 probability Gaussian or the data is normal
from scipy.stats import shapiro

# Drop rows with any missing value so shapiro() gets a clean sample.
# NOTE(review): dropna() on the whole frame also discards rows that are
# complete for "age" but missing elsewhere — dropna(subset=["age"])
# would keep more data; confirm intent.
ks = ks.dropna()
stat, p = shapiro(ks["age"])
print("stat =", stat)
print("p =", p)
# BUG FIX: restored the if/else body indentation lost in the export.
if p > 0.05:
    print("probability Gaussian or the data is normal ")
else:
    print("Probability not gaussian or the data is not normal ")
stat = 0.9906661510467529 p = 0.28414419293403625 probability Gaussian or the data is normal
sns.boxplot(ks["age"])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
<AxesSubplot:xlabel='age'>
from scipy.stats import shapiro

# Same normality check, now on the "fare" column (ks may already have
# missing rows dropped; dropna() is idempotent so this is safe).
ks = ks.dropna()
stat, p = shapiro(ks["fare"])
print("stat =", stat)
print("p =", p)
# BUG FIX: restored the if/else body indentation lost in the export.
if p > 0.05:
    print("probability Gaussian or the data is normal ")
else:
    print("Probability not gaussian or the data is not normal ")
stat = 0.7430529594421387 p = 1.6486953687823121e-16 Probability not gaussian or the data is not normal
sns.boxplot(ks["fare"])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
<AxesSubplot:xlabel='fare'>
ks["fare"].hist()
<AxesSubplot:>
Test whether two samples have a linear relationship
# Example of the Pearson's correlation test: do two samples have a
# linear relationship?
from scipy.stats import pearsonr

data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [0.353, 3.517, 0.125, -7.545, -0.555, -1.536, 3.350, -1.578, -3.537, -1.579]
stat, p = pearsonr(data1, data2)
print('stat=%.3f, p=%.3f' % (stat, p))
# p <= 0.05: reject independence at the 5% level.
# BUG FIX: restored the if/else body indentation lost in the export.
if p > 0.05:
    print('Probably independent(No correlation)')
else:
    print('Probably dependent (correlation exists)')
stat=0.688, p=0.028 Probably dependent (correlation exists)
ks1 = sns.load_dataset("titanic")
# Example of the Pearson's Correlation test
from scipy.stats import pearsonr
stat, p = pearsonr(ks1["age"], ks1["fare"])
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
print('Probably independent')
else:
print('Probably dependent')
# here there is error as the array must not contain Nan values so we should drop them first
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) ~\AppData\Local\Temp/ipykernel_892/996795437.py in <module> 1 # Example of the Pearson's Correlation test 2 from scipy.stats import pearsonr ----> 3 stat, p = pearsonr(ks1["age"], ks1["fare"]) 4 print('stat=%.3f, p=%.3f' % (stat, p)) 5 if p > 0.05: ~\AppData\Local\Programs\Python\Python310\lib\site-packages\scipy\stats\stats.py in pearsonr(x, y) 4043 # scipy.linalg.norm(xm) does not overflow if xm is, for example, 4044 # [-5e210, 5e210, 3e200, -3e200] -> 4045 normxm = linalg.norm(xm) 4046 normym = linalg.norm(ym) 4047 ~\AppData\Local\Programs\Python\Python310\lib\site-packages\scipy\linalg\misc.py in norm(a, ord, axis, keepdims, check_finite) 143 # Differs from numpy only in non-finite handling and the use of blas. 144 if check_finite: --> 145 a = np.asarray_chkfinite(a) 146 else: 147 a = np.asarray(a) ~\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\lib\function_base.py in asarray_chkfinite(a, dtype, order) 601 a = asarray(a, dtype=dtype, order=order) 602 if a.dtype.char in typecodes['AllFloat'] and not np.isfinite(a).all(): --> 603 raise ValueError( 604 "array must not contain infs or NaNs") 605 return a ValueError: array must not contain infs or NaNs
# Example of the Pearson's correlation test on the Titanic data.
from scipy.stats import pearsonr

# pearsonr() raises ValueError on NaN/inf input (see the traceback in
# the previous cell), so remove incomplete rows first.
ks1 = ks1.dropna()
stat, p = pearsonr(ks1["age"], ks1["fare"])
print('stat=%.3f, p=%.3f' % (stat, p))
# BUG FIX: restored the if/else body indentation lost in the export.
if p > 0.05:
    print('Probably independent')
else:
    print('Probably dependent')
stat=-0.091, p=0.223 Probably independent
Test whether two categorical variables are related or independent
Test whether the means of two independent samples are significantly different
# Example of the Student's t-test: are the means of two independent
# samples significantly different?
from scipy.stats import ttest_ind

data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
stat, p = ttest_ind(data1, data2)
print('stat=%.3f, p=%.3f' % (stat, p))
# Large p: no evidence the two sample means differ.
# BUG FIX: restored the if/else body indentation lost in the export.
if p > 0.05:
    print('Probably the same dataset')
else:
    print('Probably different data set')
stat=-0.326, p=0.748 Probably the same dataset
Test whether the means of two paired samples are significantly different.
# Example of the paired Student's t-test: are the means of two *paired*
# samples significantly different?
from scipy.stats import ttest_rel

data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
stat, p = ttest_rel(data1, data2)
print('stat=%.3f, p=%.3f' % (stat, p))
# BUG FIX: restored the if/else body indentation lost in the export.
if p > 0.05:
    print('Probably the same distribution')
else:
    print('Probably different distributions')
stat=-0.334, p=0.746 Probably the same distribution
Tests whether the means of two or more independent samples are significantly different.
# Example of the one-way analysis of variance (ANOVA) test: are the
# means of two or more independent samples significantly different?
from scipy.stats import f_oneway

data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
data3 = [-0.208, 0.696, 0.928, -1.148, -0.213, 0.229, 0.137, 0.269, -0.870, -1.204]
stat, p = f_oneway(data1, data2, data3)
print('stat=%.3f, p=%.3f' % (stat, p))
# BUG FIX: restored the if/else body indentation lost in the export.
if p > 0.05:
    print('Probably the same distribution')
else:
    print('Probably different distributions')
stat=0.096, p=0.908 Probably the same distribution
# Some other test
# Post hoc tests
# - tuckey test
# - Bonfroni test
# Two way ANOVA
# MANOVA
# MANCOVA
$\sum$ = Summation
$n$ = Sample size
$N$ = Population size
$X$ = Individual value
$X_1$ =Particular first value
$X_i$ = For each individual value
$p$ = Proportion of sample data
$P$ = Proportion of population data
$\bar{x}$ = Mean of sample data
$\mu$ = (mu) Mean of population
$\sigma$ = (sigma) Standard Deviation of population
$s$ = Standard deviation of sample data
$s^2$ = Variance of sample data
$\sigma^2$ = Variance of Population
$R$ = Range of data
$\bar{R}$ = Average range of Data
$k$ = Multi-purpose notation
<,> = less than , greater than
$\ge$ = Greater than equal to
$\le$ = Less than equal to
$\alpha$ = type I error rate
$\beta$ = Regression Co-efficient of population
$\theta$ = General population parameter
This is a normal text in markdown
This is a block of special text\ and the block is goin on as i have place a fwd slash
This is 40 days long corse of Data science with python also know as python ka chilla with baba ammar
this is a second line
For a line break we can use a double enter or a forward slash
Bold
italic
bold and italic
also we can use underscore (_) with (*) to do the same
bold and italic
to join this corse please scan the following QR code and join telegram group

TO print a string print("codanic")
to print a code in line
print("Hello")
to print code in a separate block
this code block will be syntax-highlighted according to Python
x= 5+6
y=8+3
z= x+y
print(z)
`
this code will show systax according to R
x= 5+6
y=8+3
z= x+y
print(z)
`
| species | petal_length | Sepal_length |
|---|---|---|
| virgenica | 18.2 | 19.2 |
| setosa | 20 | 17.2 |
| setosa | 20 | 17.2 |
| setosa | 20 | 17.2 |
Example:
This text is normal\ This text is red; we can even give a hex code to change the color
In-line math
$this_{2}^{3}$
or
$$ \int_0^\infty \frac{x^3}{e^x-1}\,dx = \frac{\pi^4}{15} $$ Math block
for mmore information we can watch:[MathJax]
Input data ==> Training models and learning patterns of data ==> Prediction ==> Output and Reports
Training models and learning patterns of data by making different clusters ==> Prediction ==> Output and Reports
Input data ==> Training models and learning patterns of data ==> Prediction ==> Output and Reports
Input data ==> Training models and learning patterns of data [ WELL-DONE] ==> Prediction ==> Output and Reports
# Core stack: numpy for arrays, matplotlib for plots, pandas for tables.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset (expects salary_data.csv in the working directory).
df = pd.read_csv('salary_data.csv')
# Peek at the first rows to confirm the columns loaded as expected.
df.head()
| YearsExperience | Salary | |
|---|---|---|
| 0 | 1.1 | 39343 |
| 1 | 1.3 | 46205 |
| 2 | 1.5 | 37731 |
| 3 | 2.0 | 43525 |
| 4 | 2.2 | 39891 |
X = df[["YearsExperience"]]
y = df["Salary"]
X.head()
| YearsExperience | |
|---|---|
| 0 | 1.1 |
| 1 | 1.3 |
| 2 | 1.5 |
| 3 | 2.0 |
| 4 | 2.2 |
y.head()
0 39343 1 46205 2 37731 3 43525 4 39891 Name: Salary, dtype: int64
# Import the train/test splitter.
from sklearn.model_selection import train_test_split

# Split the data: 80% for training, 20% held out for testing.
# random_state=0 keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Use the least-squares method to fit the line.
from sklearn.linear_model import LinearRegression

# BUG FIX: removed the unused `linreg = LinearRegression()` instance —
# it was created and never referenced.
model = LinearRegression().fit(X_train, y_train)
model
LinearRegression()
plt.scatter (X_train, y_train)
plt.plot(X_train, model.predict(X_train))
#plt.plot() ==> plot kro , plt.plot(X_train)==> X_train me , plt.plot(X_train)
# model.predict(X_train)) model ko pridict kro X_train se
[<matplotlib.lines.Line2D at 0x1dd06e01ab0>]
# Adding Colour
plt.scatter (X_train, y_train)
plt.plot(X_train, model.predict(X_train) ,color = "Green")
[<matplotlib.lines.Line2D at 0x1dd06e84310>]
# Adding axis labels and a title to the training-data plot.
plt.scatter(X_train, y_train)
plt.plot(X_train, model.predict(X_train), color="Green")
plt.xlabel("Tajurba")
plt.ylabel("Tankhwah")
plt.title("Train Plot")
# BUG FIX: plt.show is a function; without "()" it is never called and
# the bare attribute merely evaluates to the function object.
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
# Predicting testing data: plot the held-out test points against the
# regression line fitted on the training data.
plt.scatter(X_test, y_test)
plt.plot(X_train, model.predict(X_train), color="red")
plt.xlabel("Tajurba")
plt.ylabel("Tankhwah")
plt.title("Test Plot")
# BUG FIX: call plt.show() — the bare attribute does nothing.
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
# Model Fitness
# Jo library import ki hui hai usi k andr hi hum ne test score/regression score nikalna hai
print("Score for testing data =", model.score(X_test, y_test))
print("Score for training data =", model.score(X_train, y_train))
Score for testing data = 0.988169515729126 Score for training data = 0.9411949620562126
model.score(X_test, y_test)
0.988169515729126
model.score(X_train, y_train)
0.9411949620562126
model.predict([[5]])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn(
array([73342.97478427])
model.predict(X_test)
array([ 40748.96184072, 122699.62295594, 64961.65717022, 63099.14214487,
115249.56285456, 107799.50275317])
model.predict([[5],[6],[10]])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn(
array([ 73342.97478427, 82655.549911 , 119905.85041792])
a = ([[10], [20], [30], [5]])
model.predict(a)
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn(
array([119905.85041792, 213031.60168521, 306157.3529525 , 73342.97478427])
y_pred = model.predict(X_test)
y_pred
array([ 40748.96184072, 122699.62295594, 64961.65717022, 63099.14214487,
115249.56285456, 107799.50275317])
x_new = np.linspace(0, 30, 100)
y_new = model.predict(x_new[:, np.newaxis])
y_new
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn(
array([ 26780.09915063, 29602.09161327, 32424.08407592, 35246.07653856,
38068.06900121, 40890.06146385, 43712.0539265 , 46534.04638914,
49356.03885179, 52178.03131444, 55000.02377708, 57822.01623973,
60644.00870237, 63466.00116502, 66287.99362766, 69109.98609031,
71931.97855295, 74753.9710156 , 77575.96347824, 80397.95594089,
83219.94840353, 86041.94086618, 88863.93332882, 91685.92579147,
94507.91825411, 97329.91071676, 100151.9031794 , 102973.89564205,
105795.88810469, 108617.88056734, 111439.87302998, 114261.86549263,
117083.85795527, 119905.85041792, 122727.84288057, 125549.83534321,
128371.82780586, 131193.8202685 , 134015.81273115, 136837.80519379,
139659.79765644, 142481.79011908, 145303.78258173, 148125.77504437,
150947.76750702, 153769.75996966, 156591.75243231, 159413.74489495,
162235.7373576 , 165057.72982024, 167879.72228289, 170701.71474553,
173523.70720818, 176345.69967082, 179167.69213347, 181989.68459611,
184811.67705876, 187633.66952141, 190455.66198405, 193277.6544467 ,
196099.64690934, 198921.63937199, 201743.63183463, 204565.62429728,
207387.61675992, 210209.60922257, 213031.60168521, 215853.59414786,
218675.5866105 , 221497.57907315, 224319.57153579, 227141.56399844,
229963.55646108, 232785.54892373, 235607.54138637, 238429.53384902,
241251.52631166, 244073.51877431, 246895.51123695, 249717.5036996 ,
252539.49616224, 255361.48862489, 258183.48108754, 261005.47355018,
263827.46601283, 266649.45847547, 269471.45093812, 272293.44340076,
275115.43586341, 277937.42832605, 280759.4207887 , 283581.41325134,
286403.40571399, 289225.39817663, 292047.39063928, 294869.38310192,
297691.37556457, 300513.36802721, 303335.36048986, 306157.3529525 ])
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)
0.988169515729126
from sklearn import metrics
print (metrics.mean_absolute_error(y_test, y_pred))
print(metrics.mean_squared_error(y_test, y_pred))
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# RAE
# RSE
2446.1723690465055 12823412.298126549 3580.979237321343
In this project we want to know the cost of a person's health insurance and will see how it may be affected by other factors
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
ins= pd.read_csv("insurance.csv")
#looking at first 5 rows of data set
ins.head(5)
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 |
#listing the column names
ins.columns
Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')
#Checking total no. of rows and coloumns
ins.shape
(1338, 7)
#overall info of data set , which type of data we have
ins.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1338 entries, 0 to 1337 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 1338 non-null int64 1 sex 1338 non-null object 2 bmi 1338 non-null float64 3 children 1338 non-null int64 4 smoker 1338 non-null object 5 region 1338 non-null object 6 charges 1338 non-null float64 dtypes: float64(2), int64(2), object(3) memory usage: 73.3+ KB
Here we have 3 catagorical features and 4 Numerical features
ins.isnull().sum()
age 0 sex 0 bmi 0 children 0 smoker 0 region 0 charges 0 dtype: int64
plt.figure(figsize=(10,6)) # Setting the size of required figure
sns.distplot(ins['charges'])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='charges', ylabel='Density'>
plt.figure(figsize=(10,6))
sns.boxplot(ins["charges"])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
<AxesSubplot:xlabel='charges'>
Here we see that within the interquartile range the median is slightly skewed to the left; we also have many outliers, so we might need to deal with this data
plt.figure(figsize=(10,6))
sns.distplot(ins['age'])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='age', ylabel='Density'>
plt.figure(figsize=(10,6))
sns.boxplot(ins["age"])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
<AxesSubplot:xlabel='age'>
Here we see the median is almost at midd in interquartile range
plt.figure(figsize=(10,6))
sns.distplot(ins['bmi'])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='bmi', ylabel='Density'>
plt.figure(figsize=(10,6))
sns.boxplot(ins["bmi"])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
<AxesSubplot:xlabel='bmi'>
Here we see the median is almost at mid in interquartile range with some outlyers
fig = plt.figure(figsize=(10,6))
sns.heatmap(ins.corr())
<AxesSubplot:>
# with values
fig = plt.figure(figsize=(10,6))
sns.heatmap(ins.corr(), annot=True)
<AxesSubplot:>
Here we see that charges are in relation with
sns.lineplot(x= "age", y= "charges" , data = ins )
<AxesSubplot:xlabel='age', ylabel='charges'>
sns.lineplot(x= "bmi", y= "charges" , data = ins )
<AxesSubplot:xlabel='bmi', ylabel='charges'>
sns.scatterplot(x='bmi', y='charges', data=ins )
<AxesSubplot:xlabel='bmi', ylabel='charges'>
sns.jointplot(x='bmi', y='charges', data=ins , kind='hex')
<seaborn.axisgrid.JointGrid at 0x1dd0a4ec5e0>
now we will check how the charges variates with our catagorical features
Now we will transform our catagorical values and after that will check the correlation
ins["sex"].unique()
array(['female', 'male'], dtype=object)
# Encode "sex" numerically in one pass: male -> 1, female -> 0.
# (Single dict-based replace instead of two chained replace calls.)
ins["sex"] = ins["sex"].replace({"male": 1, "female": 0})
ins["smoker"].unique()
array(['yes', 'no'], dtype=object)
# Encode "smoker" numerically in one pass: yes -> 1, no -> 0.
# (Single dict-based replace instead of two chained replace calls.)
ins["smoker"] = ins["smoker"].replace({"yes": 1, "no": 0})
ins["region"].unique()
array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)
# creating dummies for region coloumn
r_dummies = pd.get_dummies(ins["region"])
# Drop column
ins = ins.drop('region',axis = 1)
# Join
ins = ins.join(r_dummies)
ins.head()
| age | sex | bmi | children | smoker | charges | northeast | northwest | southeast | southwest | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 19 | 0 | 27.900 | 0 | 1 | 16884.92400 | 0 | 0 | 0 | 1 |
| 1 | 18 | 1 | 33.770 | 1 | 0 | 1725.55230 | 0 | 0 | 1 | 0 |
| 2 | 28 | 1 | 33.000 | 3 | 0 | 4449.46200 | 0 | 0 | 1 | 0 |
| 3 | 33 | 1 | 22.705 | 0 | 0 | 21984.47061 | 0 | 1 | 0 | 0 |
| 4 | 32 | 1 | 28.880 | 0 | 0 | 3866.85520 | 0 | 1 | 0 | 0 |
# now checking the correlation
fig = plt.figure(figsize=(12,6))
sns.heatmap(ins.corr(), annot=True)
<AxesSubplot:>
Here we see charges are in correaltion with
X= ins[["age","bmi", "smoker"]]
y= ins["charges"]
# importing Linear regression algorithm from sklearn library
from sklearn.linear_model import LinearRegression
# fitting our X and y values in this algorithm
model = LinearRegression().fit(X,y)
model
LinearRegression()
# getting prediction in following way
#model.predict([[age ,bmi , smoker]])
model.predict([[19 ,27.9, 1]])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn(
array([26079.21861526])
#Cheching model co-efficient and intercept
print(model.coef_)
print(model.intercept_)
[ 259.54749155 322.61513282 23823.68449531] -11676.830425187778
coeff_ins = pd.DataFrame(model.coef_,X.columns,columns=['Co-efficient'])
coeff_ins
| Co-efficient | |
|---|---|
| age | 259.547492 |
| bmi | 322.615133 |
| smoker | 23823.684495 |
To check the accuracy we first have to split the data: some for training the model and some for testing it
#import library
from sklearn.model_selection import train_test_split
# splitting the data : 80% training data and 20% testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# fitting the training data in linear regression again
from sklearn.linear_model import LinearRegression
model1 = LinearRegression().fit(X_train, y_train)
model1
LinearRegression()
# getting a prediction again
model1.predict([[19 ,27.9, 1]])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn(
array([25916.43448944])
# getting prediction for whole values we splitted for test
y_pred = model1.predict(X_test)
#importing library for test
from sklearn.metrics import r2_score
# now testing between already know y_test values and y_pred values
score = r2_score(y_test, y_pred)
score
0.7945500805653087
So according to this r2_score test our model is 79% accurate
from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
MAE: 3960.8666198087158 MSE: 32693237.938726168 RMSE 5717.80009607945
# Interactive prediction: ask the user for the three model features.
age = input('What is your age ? \n')
bmi = input('What is your bmi ? \n')
smoker = input('press 1 if you are a smoker and 0 if not \n')
# BUG FIX: restored the try/except body indentation lost in the export.
try:
    # model expects a 2-D input [[age, bmi, smoker]] in the same order
    # it was fitted with; predict() returns an array, so take element 0.
    print('We predict {:.0f}$ will be your insurance cost .'.format(
        model.predict([[float(age), float(bmi), int(smoker)]])[0]))
except ValueError:
    # float()/int() raise ValueError on non-numeric input.
    print('Please only input either:\n- whole number e.g. 1, 4, 7\n- decimal/float number e.g. 3.8')
We predict 74168$ will be your insurance cost .
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn(
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
# Importing the dataset
df = pd.read_csv('ml_data_salary.csv')
df.head()
| age | distance | YearsExperience | Salary | |
|---|---|---|---|---|
| 0 | 31.1 | 77.75 | 1.1 | 39343 |
| 1 | 31.3 | 78.25 | 1.3 | 46205 |
| 2 | 31.5 | 78.75 | 1.5 | 37731 |
| 3 | 32.0 | 80.00 | 2.0 | 43525 |
| 4 | 32.2 | 80.50 | 2.2 | 39891 |
# Set independent and dependent variables
X = df[['distance', 'YearsExperience']]
y = df['Salary']
# Initialize model from sklearn and fit it into our data
regr = linear_model.LinearRegression()
model = regr.fit(X, y)
model
LinearRegression()
print('Intercept:', model.intercept_)
print('Coefficients:', model.coef_)
Intercept: -218603.37708034192 Coefficients: [3258.60769705 1303.44307882]
# Values to predict — interactive multi-feature salary prediction.
distance = input('How much distance you have to cover for job? \n')
YearsExperience = input('How many years of experience do you have ? \n')
# BUG FIX: restored the try/except body indentation lost in the export;
# also fixed two typos in the user-facing message ("you salary" ->
# "your salary", "workin" -> "working").
try:
    # model was fitted on [distance, YearsExperience]; keep that order.
    print('We predict {:.0f}$ will be your salary if you have to cover {}m and have working experience of {} years.'.format(
        model.predict([[float(distance), float(YearsExperience)]])[0],
        distance,
        YearsExperience))
except ValueError:
    print('Please only input either:\n- whole number e.g. 1, 4, 7\n- decimal/float number e.g. 3.8')
We predict 37360$ will be you salary if you have to cover 77.75m and have workin experience of 2 years.
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn(
X.shape
(30, 2)
# Prepare data
X = df[['distance', 'YearsExperience']].values.reshape(-1,2)
Y = df['Salary']
print(df["distance"].max())
print(df["distance"].min())
101.25 77.75
print(df["YearsExperience"].max())
print(df["YearsExperience"].min())
10.5 1.1
# Create range for each dimension
x = X[:, 0]
y = X[:, 1]
z = Y
xx_pred = np.linspace(77, 102, 30) # range of distance values
yy_pred = np.linspace(1, 11, 30) # range of YearsExperience values
xx_pred, yy_pred = np.meshgrid(xx_pred, yy_pred)
model_viz = np.array([xx_pred.flatten(), yy_pred.flatten()]).T
predicted = model.predict(model_viz)
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn(
# Evaluate model by using it's R^2 score
r2 = model.score(X, Y)
r2
#r2 score is almost near to one so we are good to go
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn(
0.9569566641435084
# Plot model visualization: three 3-D views of the fitted plane.
plt.style.use('fivethirtyeight')
fig = plt.figure(figsize=(12, 4))
ax1 = fig.add_subplot(131, projection='3d')
ax2 = fig.add_subplot(132, projection='3d')
ax3 = fig.add_subplot(133, projection='3d')
axes = [ax1, ax2, ax3]
# BUG FIX: restored the for-loop body indentation lost in the export.
for ax in axes:
    # Observed data as solid markers, model surface as hollow dots.
    ax.plot(x, y, z, color='k', zorder=15, linestyle='none', marker='o', alpha=0.5)
    ax.scatter(xx_pred.flatten(), yy_pred.flatten(), predicted, facecolor=(0,0,0,0), s=20, edgecolor='#70b3f0', )
    ax.set_xlabel('distance', fontsize=12)
    ax.set_ylabel('YearsExperience', fontsize=12)
    ax.set_zlabel('Salary', fontsize=12)
    ax.locator_params(nbins=4, axis='x')
    # NOTE(review): both locator_params calls target the x axis, so the
    # second (nbins=5) simply overrides the first — was axis='y' intended?
    ax.locator_params(nbins=5, axis='x')
# A different camera angle per panel.
ax1.view_init(elev=25, azim=-60)
ax2.view_init(elev=15, azim=15)
ax3.view_init(elev=25, azim=60)
fig.suptitle('Multi-Linear Regression Model Visualization ($R^2 = %.2f$)' % r2, fontsize=16, color='k')
fig.tight_layout()
# For accuracy: refit the model using all three available features.
from sklearn.model_selection import train_test_split

X = df[["distance", "YearsExperience", "age"]]
y = df["Salary"]
# Split the data (80% train / 20% test, reproducible split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

from sklearn.linear_model import LinearRegression

# BUG FIX: removed the unused `linreg = LinearRegression()` instance —
# it was created and never referenced.
model = LinearRegression().fit(X_train, y_train)
model
LinearRegression()
# Overlay scatter plots of all three training features against salary,
# plus the model's predictions on the training set.
plt.scatter(X_train[["distance"]], y_train)
plt.scatter(X_train[["YearsExperience"]], y_train)
plt.scatter(X_train[["age"]], y_train)
plt.plot(X_train, model.predict(X_train), color="Grey")
plt.xlabel("YearsExperience| age | distance")
plt.ylabel("Salary")
plt.title("Train Plot ")
# BUG FIX: plt.show must be called; the bare attribute does nothing.
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
#R2 test
print("Score for testing data =", model.score(X_test, y_test))
print("Score for training data =", model.score(X_train, y_train))
Score for testing data = 0.9880925772756097 Score for training data = 0.9411691490005899
model.predict([[33 ,77.75, 1.1]])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn(
array([-5.09866629e+17])
y_pred =model.predict(X_test)
y_pred
array([ 40784., 122688., 64992., 63120., 115264., 107840.])
from sklearn import metrics
print (metrics.mean_absolute_error(y_test, y_pred))
2442.5
print(metrics.mean_squared_error(y_test, y_pred))
12906808.166666666
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
3592.604649368848
retrieved from NOAA Climate Data Online tool
# Importing libraries
# Pandas is used for data manipulation
import pandas as pd
# Read in data and display first 5 rows
df = pd.read_csv('temps.csv')
df.head(5)
| year | month | day | week | temp_2 | temp_1 | average | actual | forecast_noaa | forecast_acc | forecast_under | friend | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2016 | 1 | 1 | Fri | 45 | 45 | 45.6 | 45 | 43 | 50 | 44 | 29 |
| 1 | 2016 | 1 | 2 | Sat | 44 | 45 | 45.7 | 44 | 41 | 50 | 44 | 61 |
| 2 | 2016 | 1 | 3 | Sun | 45 | 44 | 45.8 | 41 | 43 | 46 | 47 | 56 |
| 3 | 2016 | 1 | 4 | Mon | 44 | 41 | 45.9 | 40 | 44 | 48 | 46 | 53 |
| 4 | 2016 | 1 | 5 | Tues | 41 | 40 | 46.0 | 44 | 46 | 46 | 46 | 41 |
Following are explanations of the columns:
#Checking the shape of our data that how many row and coloumns we hai
print('The shape of our features is:', df.shape)
#so we have 348 rows and 12 coloumns in our data
The shape of our features is: (348, 12)
# check if there is Nan values in here our data
df.isnull().sum()
#there is no Nan value in our data
year 0 month 0 day 0 week 0 temp_2 0 temp_1 0 average 0 actual 0 forecast_noaa 0 forecast_acc 0 forecast_under 0 friend 0 dtype: int64
# getting some information of our data
df.info()
# here we have one catagorical value column .
# As we are working in regression so we have to convert it into integer type by making the dummy data
<class 'pandas.core.frame.DataFrame'> RangeIndex: 348 entries, 0 to 347 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 year 348 non-null int64 1 month 348 non-null int64 2 day 348 non-null int64 3 week 348 non-null object 4 temp_2 348 non-null int64 5 temp_1 348 non-null int64 6 average 348 non-null float64 7 actual 348 non-null int64 8 forecast_noaa 348 non-null int64 9 forecast_acc 348 non-null int64 10 forecast_under 348 non-null int64 11 friend 348 non-null int64 dtypes: float64(1), int64(10), object(1) memory usage: 32.8+ KB
# Converting the catagorical values data using pandas get_dummies
df = pd.get_dummies(df)
# Display the first 5 rows of the last 12 columns
df.iloc[:,5:].head(5)
| average | actual | forecast_noaa | forecast_acc | forecast_under | friend | week_Fri | week_Mon | week_Sat | week_Sun | week_Thurs | week_Tues | week_Wed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 45.6 | 45 | 43 | 50 | 44 | 29 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 45.7 | 44 | 41 | 50 | 44 | 61 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 2 | 45.8 | 41 | 43 | 46 | 47 | 56 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3 | 45.9 | 40 | 44 | 48 | 46 | 53 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 4 | 46.0 | 44 | 46 | 46 | 46 | 41 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
# again checking the shape of our data
df.shape
(348, 18)
# Use numpy to convert to arrays
import numpy as np
# Labels are the values we want to predict
labels = np.array(df['actual'])
# Remove the labels from the features
# axis 1 refers to the columns
df= df.drop('actual',axis = 1)
df= df.drop("forecast_noaa",axis = 1)
#'forecast_under','friend', 'week_Fri''week_Mon','week_Sat','week_Sun','week_Thurs','week_Tues','week_Wed')
# Convert to numpy array
features = np.array(df)
# Saving feature names for later use
df_list = list(df.columns)
df_list
['year', 'month', 'day', 'temp_2', 'temp_1', 'average', 'forecast_acc', 'forecast_under', 'friend', 'week_Fri', 'week_Mon', 'week_Sat', 'week_Sun', 'week_Thurs', 'week_Tues', 'week_Wed']
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)
Training Features Shape: (261, 16) Training Labels Shape: (261,) Testing Features Shape: (87, 16) Testing Labels Shape: (87,)
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(train_features, train_labels)
RandomForestRegressor(n_estimators=1000, random_state=42)
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)
#Checking score of testing and training data
print("Score for testing data =", rf.score(test_features, test_labels))
print("Score for training data =", rf.score(train_features, train_labels))
Score for testing data = 0.8149088174655048 Score for training data = 0.9746126233418512
# Checking the mean absolute error
from sklearn import metrics
print (metrics.mean_absolute_error(test_labels, predictions))
# Long way to do which we have done with sklearn library
# # Calculate the absolute errors
# errors = abs(predictions - test_labels)
# # Print out the mean absolute error (mae)
# print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
3.859586206896551
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot
# Pull out one tree from the forest
tree = rf.estimators_[5]
tree
DecisionTreeRegressor(max_features='auto', random_state=1201263687)
# Export the image to a dot file
export_graphviz(tree, out_file = 'tree.dot', feature_names = df_list, rounded = True, precision = 1)
# Use dot file to create a graph
(graph, ) = pydot.graph_from_dot_file('tree.dot')
# Write graph to a png file
graph.write_png('tree.png')
As we can see, the graph is too big to visualize comfortably, so let's make a new model with a lower number of trees.
# Limit depth of tree to 3 levels and 10 trees
rf_small = RandomForestRegressor(n_estimators=10, max_depth = 3)
rf_small.fit(train_features, train_labels)
RandomForestRegressor(max_depth=3, n_estimators=10)
# Extract the small tree
tree_small = rf_small.estimators_[5]
# Save the tree as a png image
export_graphviz(tree_small, out_file = 'small_tree.dot', feature_names = df_list, rounded = True, precision = 1)
(graph, ) = pydot.graph_from_dot_file('small_tree.dot')
graph.write_png('small_tree.png')
# To understand polynomial regression, first look at what a bad fit is.
# These x/y points are essentially random, so a degree-3 polynomial
# cannot model them well.
import numpy as np
import matplotlib.pyplot as plt
x = [89,43,36,36,95,10,66,34,38,20,26,29,48,64,6,5,36,66,72,40]
y = [21,46,3,35,67,95,53,72,58,10,26,34,90,33,38,20,56,2,47,15]
mymodel = np.poly1d(np.polyfit(x, y, 3))
# 100 evenly spaced points across the x range for a smooth curve
myline = np.linspace(2, 95, 100)
plt.scatter(x, y)
plt.plot(myline, mymodel(myline))
# Bug fix: plt.show was referenced but never called (missing parentheses).
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
# R Square test for bad fit
from sklearn.metrics import r2_score
x = [89,43,36,36,95,10,66,34,38,20,26,29,48,64,6,5,36,66,72,40]
y = [21,46,3,35,67,95,53,72,58,10,26,34,90,33,38,20,56,2,47,15]
model = np.poly1d(np.polyfit(x,y,3))
print(r2_score(y, model(x)))
0.009952707566680652
# R Square test for bad fit
from sklearn.metrics import r2_score
x = [89,43,36,36,95,10,66,34,38,20,26,29,48,64,6,5,36,66,72,40]
y = [21,46,3,35,67,95,53,72,58,10,26,34,90,33,38,20,56,2,47,15]
model = np.poly1d(np.polyfit(x,y,3))
print(r2_score(y, model(x)))
0.009952707566680652
# Step 2: Draw a smooth curve through the data using a degree-3 polynomial fit.
import numpy as np
import matplotlib.pyplot as plt
x= [1,2,3,5,6,7,8,9,10,12,13,14,15,16,18,19,21,22]
y= [100,80,80,60,60,55,60,65,70,70,75,76,78,79,90,99,99,100]
mymodel = np.poly1d(np.polyfit(x, y, 3))
# 200 evenly spaced points spanning the x range for a smooth curve
myline = np.linspace(1, 22, 200)
plt.scatter(x, y)
plt.plot(myline, mymodel(myline))
# Bug fix: plt.show was referenced but never called (missing parentheses).
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
# Step 3 : R-Squared
import numpy as np
import matplotlib.pyplot as plt
x= [1,2,3,5,6,7,8,9,10,12,13,14,15,16,18,19,21,22]
y= [100,80,80,60,60,55,60,65,70,70,75,76,78,79,90,99,99,100]
model = np.poly1d(np.polyfit(x,y,3))
print(r2_score(y, model(x)))
0.9338713637130449
# Prediction
import numpy as np
import matplotlib.pyplot as plt
x= [1,2,3,5,6,7,8,9,10,12,13,14,15,16,18,19,21,22]
y= [100,80,80,60,60,55,60,65,70,70,75,76,78,79,90,99,99,100]
model = np.poly1d(np.polyfit(x,y,3))
speed = model(10)
speed
62.60788989976389
# Importing Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# IMporting the dataset
dataset= pd.read_csv("https://s3.us-west-2.amazonaws.com/public.gamelab.fun/dataset/position_salaries.csv")
dataset.head()
| Position | Level | Salary | |
|---|---|---|---|
| 0 | Business Analyst | 1 | 45000 |
| 1 | Junior Consultant | 2 | 50000 |
| 2 | Senior Consultant | 3 | 60000 |
| 3 | Manager | 4 | 80000 |
| 4 | Country Manager | 5 | 110000 |
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Fitting Linear Regression to the dataset
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X,y)
# Visualizing the Linear Regression results
def viz_linear():
    """Scatter the raw observations and overlay the straight line fitted by lin_reg."""
    plt.scatter(X, y, color="red")                 # actual data points
    plt.plot(X, lin_reg.predict(X), color="blue")  # fitted regression line
    plt.xlabel("Position level")
    plt.ylabel("Salary")
    plt.title("Truth or Bluff (Linear Regression)")
    plt.show()

viz_linear()
# Fitting Polynomial Regression to the dataset
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X)
pol_reg = LinearRegression()
pol_reg.fit(X_poly, y)
LinearRegression()
# Visualizing the Polynomial Regression results
def viz_polymonial():
    """Scatter the raw observations and overlay the degree-4 polynomial fit."""
    plt.scatter(X, y, color="red")
    # X must pass through the fitted PolynomialFeatures before predicting.
    plt.plot(X, pol_reg.predict(poly_reg.fit_transform(X)), color="blue")
    # Bug fix: the title said "Linear Regression" (copy-pasted from viz_linear);
    # this chart shows the polynomial model.
    plt.title("Truth or Bluff (Polynomial Regression)")
    plt.xlabel("Position level")
    plt.ylabel("Salary")
    plt.show()
viz_polymonial()
# Predicting a new result with Polymonial Regression
Pred_linear = lin_reg.predict([[11]])
# Predicting a new result with Polymonial Regression
Pred_ploynomial = pol_reg.predict(poly_reg.fit_transform([[11]]))
print('Linear Regression Results =', Pred_linear)
print('Polynomial Regression Results =' ,Pred_ploynomial )
print("The differene is ", Pred_ploynomial - Pred_linear)
Linear Regression Results = [694333.33333333] Polynomial Regression Results = [1780833.33333359] The differene is [1086500.00000025]
#import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#import online data
from sklearn.datasets import load_digits
digits= load_digits()
# input variables or features (X)
digits.data.shape
X = digits.data
# means 1797 pictures size is 64 (8x8)
# output labels (y)
digits.target.shape
y= digits.target
plt.gray()
plt.matshow(digits.images[520])
<matplotlib.image.AxesImage at 0x1ca6e21fd60>
<Figure size 432x288 with 0 Axes>
plt.figure(figsize=(20,4))
for index, (image, label) in enumerate(zip(digits.data[0:5], digits.target[0:5])):
plt.subplot(1,5, index+1)
#using image(8,8) because we've seen the picsize of 64 i.e 8x8
plt.imshow(np.reshape(image,(8,8)), cmap= plt.cm.gray)
plt.title("Training : %i\n" % label, fontsize = 20)
plt.figure(figsize=(20,4))
for index, (image, label) in enumerate(zip(digits.data[0:10], digits.target[0:10])):
plt.subplot(1,10, index+1)
#using image(8,8) because we've seen the picsize of 64 i.e 8x8
plt.imshow(np.reshape(image,(8,8)), cmap= plt.cm.gray)
plt.title("Training : %i\n" % label, fontsize = 20)
plt.figure(figsize=(20,4))
for index, (image, label) in enumerate(zip(digits.data[0:10], digits.target[0:10])):
plt.subplot(1,10, index+1)
#using image(8,8) because we've seen the picsize of 64 i.e 8x8
plt.imshow(np.reshape(image,(8,8)), cmap= plt.cm.gray)
plt.title(label, fontsize = 20)
#help(plt)
#split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train,y_test = train_test_split (X, y, test_size=0.25 ,random_state= 0)
print("Train input data : " , X_train.shape)
print("Test input data : " , X_test.shape)
print("Train output data : " , y_train.shape)
print("Test output data : " , y_test.shape)
Train input data : (1347, 64) Test input data : (450, 64) Train output data : (1347,) Test output data : (450,)
# training a model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression().fit(X_train,y_train)
model
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
LogisticRegression()
print(X_test[0:10].shape)
(10, 64)
# prediction
model.predict(X_test[0:10])
array([2, 8, 2, 6, 6, 7, 1, 9, 8, 5])
y_pred = model.predict(X_test)
y_pred.shape
(450,)
# Accuracy score
score= model.score(X_test, y_test)
score
# Here score() takes X_test as input, generates predictions for it, and compares them with y_test to report the accuracy
0.9511111111111111
# Confusion matrix
from sklearn import metrics
cm= metrics.confusion_matrix(y_test, y_pred)
cm
array([[37, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[ 0, 40, 0, 0, 0, 0, 0, 0, 2, 1],
[ 0, 1, 40, 3, 0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 43, 0, 0, 0, 0, 1, 1],
[ 0, 0, 0, 0, 37, 0, 0, 1, 0, 0],
[ 0, 0, 0, 0, 0, 46, 0, 0, 0, 2],
[ 0, 1, 0, 0, 0, 0, 51, 0, 0, 0],
[ 0, 0, 0, 1, 1, 0, 0, 46, 0, 0],
[ 0, 3, 1, 0, 0, 0, 0, 0, 43, 1],
[ 0, 0, 0, 0, 0, 1, 0, 0, 1, 45]], dtype=int64)
#plotting a confusion metrix
plt.figure(figsize=(10,10))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=0.5 , square= True, cmap= "Spectral")
plt.ylabel("Actual Output")
plt.xlabel("Predicted Output")
title = "Accuracy Score : {0}".format(score)
plt.title(title, size= 15)
Text(0.5, 1.0, 'Accuracy Score : 0.9511111111111111')
plt.figure(figsize=(9,9))
plt.imshow(cm, interpolation='nearest', cmap='Pastel1')
plt.title('Confusion matrix', size = 15)
plt.colorbar()
tick_marks = np.arange(10)
plt.xticks(tick_marks, ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"], rotation=45, size = 10)
plt.yticks(tick_marks, ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"], size = 10)
plt.tight_layout()
plt.ylabel('Actual label', size = 15)
plt.xlabel('Predicted label', size = 15)
width, height = cm.shape
for x in range (width):
for y in range (height):
plt.annotate(str(cm[x][y]), xy=(y, x),
horizontalalignment='center',
verticalalignment='center')
# Collect the indices of test samples where the prediction differs from the truth.
# NOTE: zip pairs elements of y_pred with y_test, so the first loop variable is
# the *predicted* label and the second the *actual* one (the original names had
# them swapped, which made the printout misleading). Printed values are unchanged.
misclassifiedIndex = []
for index, (predicted, actual) in enumerate(zip(y_pred, y_test)):
    if actual != predicted:
        misclassifiedIndex.append(index)
        print(predicted, actual, index)
9 5 56 4 7 94 1 6 118 1 8 124 5 9 130 9 8 169 9 5 181 1 8 196 8 1 213 3 2 235 3 2 251 7 4 315 9 1 325 8 1 331 2 8 335 8 3 378 1 8 398 1 2 415 3 7 425 8 9 429 9 3 430 3 2 440
#error is this
# # Getting a mis classified lables
# index= 0
# misclassifiedIndex = []
# for label, predict in zip(y_pred , y_test):
# if predict != label:
# misclassifiedIndex.append(index)
# print(label,predict,index)
# index +=1
# plotting missclassified label with know
plt.figure(figsize= (20,5))
for plotIndex, badIndex in enumerate ( misclassifiedIndex[0:5]):
plt.subplot(1, 5, plotIndex + 1)
plt.imshow (np.reshape(X_test[badIndex],(8,8)), cmap= plt.cm.gray)
plt.title("Predicted :{}, Actual : {}".format (y_pred[badIndex], y_test[badIndex]), fontsize=18)
K-Nearest Neighbours: the "nearest relatives" algorithm.
K = number of neighbours.
Predict the response value based on the neighbours that are nearest and most numerous.
Can also be used for numerical data / regression data.
Important
Pros:
Cons :
How to improve :
import pandas as pd
# Load the food-likeness dataset.
df = pd.read_csv("mldata.csv")
# Encode the categorical gender column as integers (Male -> 1, Female -> 0);
# a single dict-based replace does both substitutions in one pass.
df["gender"] = df["gender"].replace({"Male": 1, "Female": 0})
# Features: weight and (encoded) gender; target: food likeness.
X = df[["weight", "gender"]]
y = df["likeness"]
#machine learning algorithm
from sklearn.neighbors import KNeighborsClassifier
# create and fit our model
model = KNeighborsClassifier(n_neighbors=5).fit(X, y)
model
KNeighborsClassifier()
#prediction
model.predict([[70, 1]])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but KNeighborsClassifier was fitted with feature names warnings.warn(
array(['Biryani'], dtype=object)
model.predict(X)
array(['Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Samosa', 'Samosa', 'Biryani', 'Biryani',
'Samosa', 'Biryani', 'Biryani', 'Samosa', 'Biryani', 'Samosa',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Pakora', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Samosa', 'Biryani', 'Biryani', 'Biryani', 'Samosa',
'Biryani', 'Pakora', 'Biryani', 'Biryani', 'Biryani', 'Pakora',
'Biryani', 'Biryani', 'Biryani', 'Samosa', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Pakora', 'Biryani', 'Samosa',
'Biryani', 'Pakora', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Pakora',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Samosa', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Samosa', 'Biryani', 'Biryani',
'Samosa', 'Biryani', 'Biryani', 'Biryani', 'Pakora', 'Pakora',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Samosa', 'Pakora', 'Samosa', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Samosa', 'Biryani',
'Biryani', 'Samosa', 'Biryani', 'Pakora', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Samosa', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Pakora', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Samosa', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Samosa', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Samosa', 'Biryani',
'Biryani', 'Biryani', 'Samosa', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Samosa', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani'],
dtype=object)
# Split the data into train and test sets (80/20 rule).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Create the 12-neighbour classifier and fit it on the training split
# in a single step (fit returns the estimator itself).
model = KNeighborsClassifier(n_neighbors=12).fit(X_train, y_train)
# Predict the likeness labels for the held-out test samples.
predicted_values = model.predict(X_test)
predicted_values
array(['Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
'Biryani'], dtype=object)
#checkin score
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, predicted_values)
score
0.6530612244897959
from sklearn.metrics import f1_score
score = f1_score(y_test, predicted_values,average='weighted')
score
0.5159989921894684
from sklearn.metrics import precision_score
score = precision_score(y_test, predicted_values ,average='macro')
score
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_classification.py:1318: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
0.21768707482993196
from sklearn.metrics import recall_score
score = recall_score(y_test, predicted_values,average='micro')
score
0.6530612244897959
Decision Trees (DTs) are a non-parametric (they make no fixed assumption about the form or distribution of the underlying data) supervised learning method used for classification and regression.
# Descion tree classifier me hum do ya do se ziada parameters ko lene k baad hum class define krte hain
import pandas as pd
# Bug fix: the original read_csv call was missing its closing quote and
# parenthesis (`pd.read_csv("mldata.csv`), which is a SyntaxError.
df = pd.read_csv("mldata.csv")
df.head()
| age | height | weight | gender | likeness | |
|---|---|---|---|---|---|
| 0 | 27 | 170.688 | 76.0 | Male | Biryani |
| 1 | 41 | 165.000 | 70.0 | Male | Biryani |
| 2 | 29 | 171.000 | 80.0 | Male | Biryani |
| 3 | 27 | 173.000 | 102.0 | Male | Biryani |
| 4 | 29 | 164.000 | 67.0 | Male | Biryani |
# The categorical gender column must be encoded first (Male -> 1, Female -> 0);
# a single dict-based replace does both substitutions in one pass.
df["gender"] = df["gender"].replace({"Male": 1, "Female": 0})
df.tail()
| age | height | weight | gender | likeness | |
|---|---|---|---|---|---|
| 240 | 31 | 160.0 | 60.0 | 1 | Pakora |
| 241 | 26 | 172.0 | 70.0 | 1 | Biryani |
| 242 | 40 | 178.0 | 80.0 | 1 | Biryani |
| 243 | 25 | 5.7 | 65.0 | 1 | Biryani |
| 244 | 33 | 157.0 | 56.0 | 0 | Samosa |
# catagorical variable ko define krne k liey decision tree classifier ka use krna parta hai
# selection of input and output variables
X= df[["weight","gender"]]
y= df["likeness"]
X.head()
| weight | gender | |
|---|---|---|
| 0 | 76.0 | 1 |
| 1 | 70.0 | 1 |
| 2 | 80.0 | 1 |
| 3 | 102.0 | 1 |
| 4 | 67.0 | 1 |
y.head()
0 Biryani 1 Biryani 2 Biryani 3 Biryani 4 Biryani Name: likeness, dtype: object
#machine learning algorithm
from sklearn.tree import DecisionTreeClassifier
# create and fit our model
model = DecisionTreeClassifier().fit(X,y)
#prediction
model.predict([[80, 1]])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names warnings.warn(
array(['Biryani'], dtype=object)
model.predict([[23, 1]])[0]
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names warnings.warn(
'Biryani'
# How to measure the accuracy of our model:
# split the data into train and test sets (80/20 rule).
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Create the classifier and fit it on the training split in one step
# (fit returns the estimator itself).
model = DecisionTreeClassifier().fit(X_train, y_train)
# Predict on the held-out test samples.
predicted_values = model.predict(X_test)
predicted_values
# Check the score: the fraction of test samples predicted correctly.
score = accuracy_score(y_test, predicted_values)
score
0.6122448979591837
import seaborn as sns
import matplotlib.pyplot as plt
sns.boxplot(x= X_train["weight"],hue= X_train["gender"] ,y= y_train, data= df)
<AxesSubplot:xlabel='weight', ylabel='likeness'>
# HOw to train and save your model
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import joblib
model = DecisionTreeClassifier().fit(X,y)
joblib.dump(model, "foodie.joblib")
['foodie.joblib']
#graph
from sklearn import tree
model = DecisionTreeClassifier().fit(X,y)
#graph
from sklearn import tree
model = DecisionTreeClassifier().fit(X,y)
# model.fit(X, y)
# graphic evaluation / look into what happened
# Export the fitted tree for graphic evaluation with Graphviz.
# Bug fix: feature_names must match the training features X = df[["weight", "gender"]];
# the original passed ["age", "gender"], mislabelling the first split variable.
tree.export_graphviz(model,
                     out_file="foodie.dot",
                     feature_names=["weight", "gender"],
                     class_names=sorted(y.unique()),
                     label="all",
                     rounded=True,
                     filled=True)
#Load data set
import pandas as pd
import seaborn as sns
import numpy as np
df1= sns.load_dataset("iris")
df1.head()
| sepal_length | sepal_width | petal_length | petal_width | species | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
# .iloc selects by position: [rows, columns]. Before the comma, ":" takes
# all rows; after the comma, ":-1" takes every column except the last,
# so X holds all feature columns.
X= df1.iloc[: , :-1]
# Here "-1:" keeps only the last column (species), skipping the others.
y= df1.iloc[:, -1:]
# Note: writing just -1 (instead of the slice -1:) would return a Series of
# values; the slice keeps a DataFrame so the column label is preserved.
X.head()
| sepal_length | sepal_width | petal_length | petal_width | |
|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 |
y.head()
| species | |
|---|---|
| 0 | setosa |
| 1 | setosa |
| 2 | setosa |
| 3 | setosa |
| 4 | setosa |
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
model = DecisionTreeClassifier().fit(X,y)
model
DecisionTreeClassifier()
plot_tree(model , filled = True)
plt.title("Decision tree trained model of IRIS data")
# The following commented lines show how to save the figure in various formats
# #saving high resolution png
# plt.savefig('saving-a-high-resolution-plot.png', dpi=300)
# #saving high resolution png with transparent
# plt.savefig('saving-a-plot-as-png-file-transparent.png',dpi=400, transparent=True)
# #saving a pdf file a with high resolution
# plt.savefig('saving-a-plot-in-pdf.pdf', dpi=400)
# #saving a tiff file with high resolution
# plt.savefig('saving-a-plot-as-tiff-file.tiff', dpi=500)
plt.show()
# load libraries and data set
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df= sns.load_dataset("iris")
df.head()
| sepal_length | sepal_width | petal_length | petal_width | species | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
X= df.iloc[: ,:-1]
y= df.iloc[:, -1:]
from sklearn.ensemble import RandomForestClassifier
#n_estimators default 100
# The Number of trees in forest
model = RandomForestClassifier(n_estimators=100)
model.fit(X,y)
model.predict([[5,4,2,6]])
C:\Users\Epazz\AppData\Local\Temp/ipykernel_10720/889084137.py:5: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel(). model.fit(X,y) C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but RandomForestClassifier was fitted with feature names warnings.warn(
array(['setosa'], dtype=object)
# IMporting library
from sklearn.model_selection import train_test_split
# splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
y_pred= model.predict(X_test)
y_pred
array(['setosa', 'virginica', 'versicolor', 'setosa', 'setosa', 'setosa',
'versicolor', 'setosa', 'virginica', 'versicolor', 'virginica',
'virginica', 'virginica', 'setosa', 'virginica', 'virginica',
'versicolor', 'versicolor', 'versicolor', 'setosa', 'setosa',
'versicolor', 'setosa', 'versicolor', 'versicolor', 'virginica',
'virginica', 'versicolor', 'versicolor', 'setosa', 'virginica',
'virginica', 'virginica', 'setosa', 'versicolor', 'setosa',
'setosa', 'versicolor'], dtype=object)
# Accuracy test
score = model.score(X_test, y_test)
score
1.0
from sklearn import metrics
cm = metrics.confusion_matrix(y_test, y_pred)
cm
array([[13, 0, 0],
[ 0, 13, 0],
[ 0, 0, 12]], dtype=int64)
#plotting a confusion metrix
plt.figure(figsize=(10,10))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=0.5 , square= True, cmap= "Spectral")
plt.ylabel("Actual Output")
plt.xlabel("Predicted Output")
title = "Accuracy Score : {0}".format(score)
plt.title(title, size= 15)
Text(0.5, 1.0, 'Accuracy Score : 1.0')
There are three types of Naive Bayes model under the scikit-learn library:
Gaussian: It is used in classification and it assumes that features follow a normal distribution.\ When the values of predictors are continuous in nature and it is assumed that they follow Gaussian distribution.
Multinomial: It is used for discrete counts — when the predictors are counts of how often an outcome was observed over n trials.\ For example, in a text classification problem, instead of the Bernoulli-style "word occurs in the document" we take it one step further and count "how often the word occurs in the document"; you can think of it as "the number of times outcome number x_i is observed over the n trials".
Bernoulli: The binomial model is useful if your feature vectors are binary (i.e. zeros and ones). One application would be text classification with ‘bag of words’ model where the 1s & 0s are “word occurs in the document” and “word does not occur in the document” respectively.\ mostly used for document or text classification problems
# load libraires adn data set
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load data set from Seaborn library
df= sns.load_dataset("iris")
df.head()
| sepal_length | sepal_width | petal_length | petal_width | species | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
#Defining features and lables at X and y respectively
X= df.iloc[: ,:-1]
y= df.iloc[:, -1:]
# Checking our variables
print(X.head())
print(y.head())
sepal_length sepal_width petal_length petal_width 0 5.1 3.5 1.4 0.2 1 4.9 3.0 1.4 0.2 2 4.7 3.2 1.3 0.2 3 4.6 3.1 1.5 0.2 4 5.0 3.6 1.4 0.2 species 0 setosa 1 setosa 2 setosa 3 setosa 4 setosa
# splitting X and y into training and testing sets at 80/20 rule
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# training the model on training set
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py:993: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True)
GaussianNB()
# making predictions on the testing set
y_pred = gnb.predict(X_test)
# comparing actual response values (y_test) with predicted response values (y_pred)
from sklearn import metrics
print("Gaussian Naive Bayes model accuracy(in %):", metrics.accuracy_score(y_test, y_pred)*100)
Gaussian Naive Bayes model accuracy(in %): 96.66666666666667
#Import scikit-learn dataset library
from sklearn import datasets
#Load dataset
cancer = datasets.load_breast_cancer()
type(cancer)
sklearn.utils.Bunch
# print the names of the 30 features
print("Features: ", cancer.feature_names)
# print the label type of cancer('malignant' "benign')
print("Labels: ", cancer.target_names)
Features: ['mean radius' 'mean texture' 'mean perimeter' 'mean area' 'mean smoothness' 'mean compactness' 'mean concavity' 'mean concave points' 'mean symmetry' 'mean fractal dimension' 'radius error' 'texture error' 'perimeter error' 'area error' 'smoothness error' 'compactness error' 'concavity error' 'concave points error' 'symmetry error' 'fractal dimension error' 'worst radius' 'worst texture' 'worst perimeter' 'worst area' 'worst smoothness' 'worst compactness' 'worst concavity' 'worst concave points' 'worst symmetry' 'worst fractal dimension'] Labels: ['malignant' 'benign']
# Print data (features) shape
cancer.data.shape
(569, 30)
# Show the feature rows for the first 5 records as a preview of the data.
print(cancer.data[0:5])
[[1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01 1.471e-01 2.419e-01 7.871e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02 6.399e-03 4.904e-02 5.373e-02 1.587e-02 3.003e-02 6.193e-03 2.538e+01 1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01 4.601e-01 1.189e-01] [2.057e+01 1.777e+01 1.329e+02 1.326e+03 8.474e-02 7.864e-02 8.690e-02 7.017e-02 1.812e-01 5.667e-02 5.435e-01 7.339e-01 3.398e+00 7.408e+01 5.225e-03 1.308e-02 1.860e-02 1.340e-02 1.389e-02 3.532e-03 2.499e+01 2.341e+01 1.588e+02 1.956e+03 1.238e-01 1.866e-01 2.416e-01 1.860e-01 2.750e-01 8.902e-02] [1.969e+01 2.125e+01 1.300e+02 1.203e+03 1.096e-01 1.599e-01 1.974e-01 1.279e-01 2.069e-01 5.999e-02 7.456e-01 7.869e-01 4.585e+00 9.403e+01 6.150e-03 4.006e-02 3.832e-02 2.058e-02 2.250e-02 4.571e-03 2.357e+01 2.553e+01 1.525e+02 1.709e+03 1.444e-01 4.245e-01 4.504e-01 2.430e-01 3.613e-01 8.758e-02] [1.142e+01 2.038e+01 7.758e+01 3.861e+02 1.425e-01 2.839e-01 2.414e-01 1.052e-01 2.597e-01 9.744e-02 4.956e-01 1.156e+00 3.445e+00 2.723e+01 9.110e-03 7.458e-02 5.661e-02 1.867e-02 5.963e-02 9.208e-03 1.491e+01 2.650e+01 9.887e+01 5.677e+02 2.098e-01 8.663e-01 6.869e-01 2.575e-01 6.638e-01 1.730e-01] [2.029e+01 1.434e+01 1.351e+02 1.297e+03 1.003e-01 1.328e-01 1.980e-01 1.043e-01 1.809e-01 5.883e-02 7.572e-01 7.813e-01 5.438e+00 9.444e+01 1.149e-02 2.461e-02 5.688e-02 1.885e-02 1.756e-02 5.115e-03 2.254e+01 1.667e+01 1.522e+02 1.575e+03 1.374e-01 2.050e-01 4.000e-01 1.625e-01 2.364e-01 7.678e-02]]
# Print the class label for every sample (0: malignant, 1: benign).
print(cancer.target)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 1 1 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 0 1 1 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 0 0 1 0 1 0 1 1 1 1 1 0 1 1 0 1 0 1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 1]
# Import the train/test splitting helper.
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, test_size=0.2, random_state=0)
# Build and train a Support Vector Machine classifier.
from sklearn import svm

# Linear kernel: the decision boundary is a hyperplane in feature space.
clf = svm.SVC(kernel='linear')
# Fit the classifier on the training split.
clf.fit(X_train, y_train)
# Predict labels for the held-out test samples.
y_pred = clf.predict(X_test)
# Import scikit-learn's metrics module for accuracy calculation.
from sklearn import metrics
# Compute the accuracy once and reuse it — the original recomputed
# accuracy_score inside the print even though `score` already held it
# (score is also used later for the heatmap title).
score = metrics.accuracy_score(y_test, y_pred)
# Model Accuracy: how often is the classifier correct?
print("Accuracy:", score)
Accuracy: 0.956140350877193
# Precision: of the samples predicted positive, how many truly are positive?
precision = metrics.precision_score(y_test, y_pred)
print("Precision:", precision)
# Recall: of the truly positive samples, how many did the model find?
recall = metrics.recall_score(y_test, y_pred)
print("Recall:", recall)
Precision: 0.984375 Recall: 0.9402985074626866
# Build the confusion matrix: rows are actual classes, columns are
# predicted classes.
from sklearn import metrics
cm = metrics.confusion_matrix(y_test, y_pred)
cm
array([[46, 1],
[ 4, 63]], dtype=int64)
import seaborn as sns
import matplotlib.pyplot as plt

# Draw the confusion matrix as an annotated heatmap.
plt.figure(figsize=(12, 12))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Spectral')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
# Embed the model's accuracy (as a percentage) in the plot title.
all_sample_title = 'SVM model accuracy(in %): {0}'.format(score * 100)
plt.title(all_sample_title, size=15)
Text(0.5, 1.0, 'SVM model accuracy(in %): 95.6140350877193')
Clusters: A group of objects that are similar to the other objects in the same cluster and dissimilar to the data points in other clusters
| Clustering | Classification |
|---|---|
| Un Supervised | Supervised |
| No training data | Labeled data |
| Unlabeled data | Labeled data |
| Define labels using data then in algorithm | Data>Model>Training>Classification |
It is the simplest and most commonly used iterative unsupervised learning algorithm. In this, we randomly initialize K centroids in the data and iterate these centroids until no change happens to the position of the centroids. Let’s go through the steps involved in K-means clustering for a better understanding.
1) Select the number of clusters for the dataset ( K )
2) Select K number of centroids
3) By calculating the Euclidean distance or Manhattan distance assign the points to the nearest centroid, thus creating K groups
4) Now find the original centroid in each group
5) Again reassign the whole data point based on this new centroid, then repeat step 4 until the position of the centroid doesn’t change.
Explained in Example 3
# Importing libraries.
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
from sklearn.cluster import KMeans
# Use the public sklearn.datasets API: sklearn.datasets._samples_generator
# is a private module and importing it fails on newer scikit-learn releases.
from sklearn.datasets import make_blobs

# Build a 2D toy dataset with four Gaussian blobs.
X, y_true = make_blobs(n_samples=400, centers=4, cluster_std=0.60, random_state=0)
# Visualize the raw (unlabeled) points; s=10 is the marker size.
plt.scatter(X[:, 0], X[:, 1], s=10)
plt.show()
# Fit K-means with the known number of clusters (4), then assign each
# point to its nearest learned centroid.
kmeans = KMeans(n_clusters=4)
kmeans.fit(X)
y_kmeans = kmeans.predict(X)
# Color the points by assigned cluster and overlay the learned centers.
plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, s=20, cmap='summer')
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='blue', s=100, alpha=0.9)
plt.show()
We will use K-means clustering on a simple digits dataset. Without relying on the original label information, K-means will try to identify digits that are similar.
# Importing libraries.
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
from sklearn.cluster import KMeans
# Load the handwritten-digits dataset (8x8 images flattened to 64 features)
# and check its dimensions.
from sklearn.datasets import load_digits

digits = load_digits()
# (n_samples, n_features) -> (1797, 64)
digits.data.shape
(1797, 64)
# Cluster the digit images into 10 groups (one hoped-for group per digit),
# fitting and assigning labels in a single step.
kmeans = KMeans(n_clusters=10, random_state=0)
clusters = kmeans.fit_predict(digits.data)
# Each cluster center lives in the same 64-dimensional space as the images.
kmeans.cluster_centers_.shape
# Indicates that K-means generated 10 clusters with 64 features each.
(10, 64)
# Visualize the 10 learned cluster centers as 8x8 grayscale images.
fig, ax = plt.subplots(2, 5, figsize=(8, 3))
centers = kmeans.cluster_centers_.reshape(10, 8, 8)
for panel, img in zip(ax.flat, centers):
    panel.set(xticks=[], yticks=[])
    panel.imshow(img, interpolation='nearest', cmap=plt.cm.binary)
# The resulting picture shows the cluster centers learned by K-means.
# Map each K-means cluster id to the most common true digit within it,
# so the unsupervised labels can be compared against the ground truth.
from scipy.stats import mode

labels = np.zeros_like(clusters)
for cluster_id in range(10):
    members = (clusters == cluster_id)
    labels[members] = mode(digits.target[members])[0]
# Fraction of digits whose mapped cluster label matches the true label.
from sklearn.metrics import accuracy_score
accuracy_score(digits.target, labels)
0.7935447968836951
# Importing libraries.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

# Load the mall-customers dataset and preview the first rows.
dataset = pd.read_csv('Mall_Customers.csv')
dataset.head()
| CustomerID | Genre | Age | Annual Income (k$) | Spending Score (1-100) | |
|---|---|---|---|---|---|
| 0 | 1 | Male | 19 | 15 | 39 |
| 1 | 2 | Male | 21 | 15 | 81 |
| 2 | 3 | Female | 20 | 16 | 6 |
| 3 | 4 | Female | 23 | 16 | 77 |
| 4 | 5 | Female | 31 | 17 | 40 |
# Keep only columns 3 and 4: Annual Income (k$) and Spending Score (1-100).
X = dataset.iloc[:, 3:5].values
X
array([[ 15, 39],
[ 15, 81],
[ 16, 6],
[ 16, 77],
[ 17, 40],
[ 17, 76],
[ 18, 6],
[ 18, 94],
[ 19, 3],
[ 19, 72],
[ 19, 14],
[ 19, 99],
[ 20, 15],
[ 20, 77],
[ 20, 13],
[ 20, 79],
[ 21, 35],
[ 21, 66],
[ 23, 29],
[ 23, 98],
[ 24, 35],
[ 24, 73],
[ 25, 5],
[ 25, 73],
[ 28, 14],
[ 28, 82],
[ 28, 32],
[ 28, 61],
[ 29, 31],
[ 29, 87],
[ 30, 4],
[ 30, 73],
[ 33, 4],
[ 33, 92],
[ 33, 14],
[ 33, 81],
[ 34, 17],
[ 34, 73],
[ 37, 26],
[ 37, 75],
[ 38, 35],
[ 38, 92],
[ 39, 36],
[ 39, 61],
[ 39, 28],
[ 39, 65],
[ 40, 55],
[ 40, 47],
[ 40, 42],
[ 40, 42],
[ 42, 52],
[ 42, 60],
[ 43, 54],
[ 43, 60],
[ 43, 45],
[ 43, 41],
[ 44, 50],
[ 44, 46],
[ 46, 51],
[ 46, 46],
[ 46, 56],
[ 46, 55],
[ 47, 52],
[ 47, 59],
[ 48, 51],
[ 48, 59],
[ 48, 50],
[ 48, 48],
[ 48, 59],
[ 48, 47],
[ 49, 55],
[ 49, 42],
[ 50, 49],
[ 50, 56],
[ 54, 47],
[ 54, 54],
[ 54, 53],
[ 54, 48],
[ 54, 52],
[ 54, 42],
[ 54, 51],
[ 54, 55],
[ 54, 41],
[ 54, 44],
[ 54, 57],
[ 54, 46],
[ 57, 58],
[ 57, 55],
[ 58, 60],
[ 58, 46],
[ 59, 55],
[ 59, 41],
[ 60, 49],
[ 60, 40],
[ 60, 42],
[ 60, 52],
[ 60, 47],
[ 60, 50],
[ 61, 42],
[ 61, 49],
[ 62, 41],
[ 62, 48],
[ 62, 59],
[ 62, 55],
[ 62, 56],
[ 62, 42],
[ 63, 50],
[ 63, 46],
[ 63, 43],
[ 63, 48],
[ 63, 52],
[ 63, 54],
[ 64, 42],
[ 64, 46],
[ 65, 48],
[ 65, 50],
[ 65, 43],
[ 65, 59],
[ 67, 43],
[ 67, 57],
[ 67, 56],
[ 67, 40],
[ 69, 58],
[ 69, 91],
[ 70, 29],
[ 70, 77],
[ 71, 35],
[ 71, 95],
[ 71, 11],
[ 71, 75],
[ 71, 9],
[ 71, 75],
[ 72, 34],
[ 72, 71],
[ 73, 5],
[ 73, 88],
[ 73, 7],
[ 73, 73],
[ 74, 10],
[ 74, 72],
[ 75, 5],
[ 75, 93],
[ 76, 40],
[ 76, 87],
[ 77, 12],
[ 77, 97],
[ 77, 36],
[ 77, 74],
[ 78, 22],
[ 78, 90],
[ 78, 17],
[ 78, 88],
[ 78, 20],
[ 78, 76],
[ 78, 16],
[ 78, 89],
[ 78, 1],
[ 78, 78],
[ 78, 1],
[ 78, 73],
[ 79, 35],
[ 79, 83],
[ 81, 5],
[ 81, 93],
[ 85, 26],
[ 85, 75],
[ 86, 20],
[ 86, 95],
[ 87, 27],
[ 87, 63],
[ 87, 13],
[ 87, 75],
[ 87, 10],
[ 87, 92],
[ 88, 13],
[ 88, 86],
[ 88, 15],
[ 88, 69],
[ 93, 14],
[ 93, 90],
[ 97, 32],
[ 97, 86],
[ 98, 15],
[ 98, 88],
[ 99, 39],
[ 99, 97],
[101, 24],
[101, 68],
[103, 17],
[103, 85],
[103, 23],
[103, 69],
[113, 8],
[113, 91],
[120, 16],
[120, 79],
[126, 28],
[126, 74],
[137, 18],
[137, 83]], dtype=int64)
In the Elbow method, we are actually varying the number of clusters ( K ) from 1 – 10. For each value of K, we are calculating WCSS ( Within-Cluster Sum of Squares ). WCSS is the sum of squared distances between each point and the centroid in a cluster. When we plot the WCSS against the K value, the plot looks like an elbow. As the number of clusters increases, the WCSS value will start to decrease. The WCSS value is largest when K = 1. When we analyze the graph we can see that it changes rapidly at a point, thus creating an elbow shape. From this point, the graph starts to move almost parallel to the X-axis. The K value corresponding to this point is the optimal K value, i.e. the optimal number of clusters.
# Elbow method: fit K-means for K = 1..10 and record the WCSS
# (within-cluster sum of squares, sklearn's `inertia_`) for each K.
from sklearn.cluster import KMeans

wcss = []
for k in range(1, 11):
    # 'k-means++' is the centroid-initialization method: it seeds the
    # centroids spread apart for faster, more stable convergence.
    model = KMeans(n_clusters=k, init='k-means++', random_state=42)
    model.fit(X)
    wcss.append(model.inertia_)
# Plot WCSS against K; the "elbow" in the curve marks the optimal K.
plt.plot(range(1, 11), wcss)
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
# The elbow appears at K = 5, so that is the optimal number of clusters.
# Train the final model on the dataset with 5 clusters and assign each
# row of X to a cluster in one step.
kmeans = KMeans(n_clusters=5, init="k-means++", random_state=42)
y_kmeans = kmeans.fit_predict(X)
y_kmeans
# y_kmeans holds the cluster index (0-4) assigned to each row of X.
array([2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 0,
2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 4, 0, 4, 1, 4, 1, 4,
0, 4, 1, 4, 1, 4, 1, 4, 1, 4, 0, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4,
1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4,
1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4,
1, 4])
# Scatter-plot each of the 5 clusters in its own color, then overlay
# the learned centroids in black.
cluster_colors = ['red', 'blue', 'green', 'violet', 'yellow']
for idx, color in enumerate(cluster_colors):
    members = X[y_kmeans == idx]
    plt.scatter(members[:, 0], members[:, 1], s=60, c=color,
                label='Cluster' + str(idx + 1))
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s=100, c='black', label='Centroids')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()